Mirror of https://github.com/katanemo/plano.git (synced 2026-05-10 16:22:42 +02:00)
use plano-orchestrator for LLM routing, remove arch-router (#886)
parent 980faef6be
commit 90b926c2ce
29 changed files with 407 additions and 1412 deletions
@@ -5,7 +5,6 @@ use common::configuration::{Agent, FilterPipeline, Listener, ModelAlias, SpanAtt
use common::llm_providers::LlmProviders;
use tokio::sync::RwLock;
use crate::router::llm::RouterService;
use crate::router::orchestrator::OrchestratorService;
use crate::state::StateStorage;

@@ -14,7 +13,6 @@ use crate::state::StateStorage;
/// Instead of cloning 8+ individual `Arc`s per connection, a single
/// `Arc<AppState>` is cloned once and passed to the request handler.
pub struct AppState {
    pub router_service: Arc<RouterService>,
    pub orchestrator_service: Arc<OrchestratorService>,
    pub model_aliases: Option<HashMap<String, ModelAlias>>,
    pub llm_providers: Arc<RwLock<LlmProviders>>,
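The doc comment above motivates the shared-state design: instead of cloning many individual Arcs per connection, the server clones one Arc<AppState>. A minimal sketch of that pattern, with hypothetical fields rather than the project's real struct:

use std::sync::Arc;

// Sketch only: one shared state object behind a single Arc.
struct AppState {
    router_url: String,    // hypothetical fields for illustration
    default_model: String,
}

fn handle_connection(state: Arc<AppState>) {
    // The handler reaches every field through one pointer-sized clone.
    println!("{} -> {}", state.router_url, state.default_model);
}

fn main() {
    let state = Arc::new(AppState {
        router_url: "http://localhost:8080".to_string(),
        default_model: "test-model".to_string(),
    });
    for _ in 0..3 {
        handle_connection(Arc::clone(&state)); // one clone per connection
    }
}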
@@ -177,6 +177,7 @@ mod tests {
            "http://localhost:8080".to_string(),
            "test-model".to_string(),
            "plano-orchestrator".to_string(),
            crate::router::orchestrator_model_v1::MAX_TOKEN_LEN,
        ))
    }

@@ -23,6 +23,7 @@ mod tests {
            "http://localhost:8080".to_string(),
            "test-model".to_string(),
            "plano-orchestrator".to_string(),
            crate::router::orchestrator_model_v1::MAX_TOKEN_LEN,
        ))
    }
@@ -147,8 +148,8 @@ mod tests {
    #[tokio::test]
    async fn test_error_handling_flow() {
        let router_service = create_test_orchestrator_service();
        let agent_selector = AgentSelector::new(router_service);
        let orchestrator_service = create_test_orchestrator_service();
        let agent_selector = AgentSelector::new(orchestrator_service);

        // Test listener not found
        let result = agent_selector.find_listener(Some("nonexistent"), &[]);
@@ -22,7 +22,6 @@ pub(crate) mod model_selection;

use crate::app_state::AppState;
use crate::handlers::agents::pipeline::PipelineProcessor;
use crate::handlers::extract_or_generate_traceparent;
use crate::handlers::extract_request_id;
use crate::handlers::full;
use crate::state::response_state_processor::ResponsesStateProcessor;
@@ -92,22 +91,20 @@ async fn llm_chat_inner(
        }
    });

    let traceparent = extract_or_generate_traceparent(&request_headers);

    // Session pinning: extract session ID and check cache before routing
    let session_id: Option<String> = request_headers
        .get(MODEL_AFFINITY_HEADER)
        .and_then(|h| h.to_str().ok())
        .map(|s| s.to_string());
    let tenant_id: Option<String> = state
        .router_service
        .orchestrator_service
        .tenant_header()
        .and_then(|hdr| request_headers.get(hdr))
        .and_then(|v| v.to_str().ok())
        .map(|s| s.to_string());
    let pinned_model: Option<String> = if let Some(ref sid) = session_id {
        state
            .router_service
            .orchestrator_service
            .get_cached_route(sid, tenant_id.as_deref())
            .await
            .map(|c| c.model_name)

@@ -287,9 +284,8 @@ async fn llm_chat_inner(
    let routing_result = match async {
        set_service_name(operation_component::ROUTING);
        router_chat_get_upstream_model(
            Arc::clone(&state.router_service),
            Arc::clone(&state.orchestrator_service),
            client_request,
            &traceparent,
            &request_path,
            &request_id,
            inline_routing_preferences,

@@ -315,10 +311,9 @@ async fn llm_chat_inner(
        alias_resolved_model.clone()
    };

    // Cache the routing decision so subsequent requests with the same session ID are pinned
    if let Some(ref sid) = session_id {
        state
            .router_service
            .orchestrator_service
            .cache_route(sid.clone(), tenant_id.as_deref(), model.clone(), route_name)
            .await;
    }
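The comments above describe session pinning: a request carrying the model-affinity header reuses the route cached for that session, and a fresh routing decision is written back so later requests stay pinned. A compact sketch of that lookup/write-back flow, assuming a plain in-memory map in place of the real TTL-aware SessionCache (all names here are hypothetical):

use std::collections::HashMap;
use std::sync::Mutex;

// Minimal sketch of session pinning; the real service uses an async,
// TTL-aware SessionCache shared behind an Arc.
struct RouteCache {
    by_session: Mutex<HashMap<String, String>>, // session key -> model name
}

impl RouteCache {
    fn key(tenant: Option<&str>, session: &str) -> String {
        match tenant {
            Some(t) => format!("{t}:{session}"), // tenant-scoped key, as in session_key()
            None => session.to_string(),
        }
    }

    fn pinned_model(&self, tenant: Option<&str>, session: &str) -> Option<String> {
        self.by_session.lock().unwrap().get(&Self::key(tenant, session)).cloned()
    }

    fn pin(&self, tenant: Option<&str>, session: &str, model: String) {
        self.by_session.lock().unwrap().insert(Self::key(tenant, session), model);
    }
}

fn main() {
    let cache = RouteCache { by_session: Mutex::new(HashMap::new()) };
    // First request: no pinned model, so route normally, then cache the decision.
    assert!(cache.pinned_model(Some("acme"), "s1").is_none());
    cache.pin(Some("acme"), "s1", "gpt-4o".to_string());
    // Subsequent requests with the same session header reuse the cached model.
    assert_eq!(cache.pinned_model(Some("acme"), "s1").as_deref(), Some("gpt-4o"));
}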
@@ -5,7 +5,7 @@ use hyper::StatusCode;
use std::sync::Arc;
use tracing::{debug, info, warn};

use crate::router::llm::RouterService;
use crate::router::orchestrator::OrchestratorService;
use crate::streaming::truncate_message;
use crate::tracing::routing;

@@ -37,9 +37,8 @@ impl RoutingError {
/// * `Ok(RoutingResult)` - Contains the selected model name and span ID
/// * `Err(RoutingError)` - Contains error details and optional span ID
pub async fn router_chat_get_upstream_model(
    router_service: Arc<RouterService>,
    orchestrator_service: Arc<OrchestratorService>,
    client_request: ProviderRequestType,
    traceparent: &str,
    request_path: &str,
    request_id: &str,
    inline_routing_preferences: Option<Vec<TopLevelRoutingPreference>>,

@@ -99,11 +98,9 @@ pub async fn router_chat_get_upstream_model(
    // Capture start time for routing span
    let routing_start_time = std::time::Instant::now();

    // Attempt to determine route using the router service
    let routing_result = router_service
    let routing_result = orchestrator_service
        .determine_route(
            &chat_request.messages,
            traceparent,
            inline_routing_preferences,
            request_id,
        )
@@ -12,7 +12,7 @@ use tracing::{debug, info, info_span, warn, Instrument};

use super::extract_or_generate_traceparent;
use crate::handlers::llm::model_selection::router_chat_get_upstream_model;
use crate::router::llm::RouterService;
use crate::router::orchestrator::OrchestratorService;
use crate::tracing::{collect_custom_trace_attributes, operation_component, set_service_name};

/// Extracts `routing_preferences` from a JSON body, returning the cleaned body bytes

@@ -60,7 +60,7 @@ struct RoutingDecisionResponse {

pub async fn routing_decision(
    request: Request<hyper::body::Incoming>,
    router_service: Arc<RouterService>,
    orchestrator_service: Arc<OrchestratorService>,
    request_path: String,
    span_attributes: &Option<SpanAttributes>,
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {

@@ -76,7 +76,7 @@ pub async fn routing_decision(
        .and_then(|h| h.to_str().ok())
        .map(|s| s.to_string());

    let tenant_id: Option<String> = router_service
    let tenant_id: Option<String> = orchestrator_service
        .tenant_header()
        .and_then(|hdr| request_headers.get(hdr))
        .and_then(|v| v.to_str().ok())

@@ -94,7 +94,7 @@ pub async fn routing_decision(

    routing_decision_inner(
        request,
        router_service,
        orchestrator_service,
        request_id,
        request_path,
        request_headers,

@@ -109,7 +109,7 @@ pub async fn routing_decision(
#[allow(clippy::too_many_arguments)]
async fn routing_decision_inner(
    request: Request<hyper::body::Incoming>,
    router_service: Arc<RouterService>,
    orchestrator_service: Arc<OrchestratorService>,
    request_id: String,
    request_path: String,
    request_headers: hyper::HeaderMap,

@@ -133,9 +133,8 @@ async fn routing_decision_inner(
        .unwrap_or("unknown")
        .to_string();

    // Session pinning: check cache before doing any routing work
    if let Some(ref sid) = session_id {
        if let Some(cached) = router_service
        if let Some(cached) = orchestrator_service
            .get_cached_route(sid, tenant_id.as_deref())
            .await
        {

@@ -202,9 +201,8 @@ async fn routing_decision_inner(
    };

    let routing_result = router_chat_get_upstream_model(
        Arc::clone(&router_service),
        Arc::clone(&orchestrator_service),
        client_request,
        &traceparent,
        &request_path,
        &request_id,
        inline_routing_preferences,

@@ -213,9 +211,8 @@ async fn routing_decision_inner(

    match routing_result {
        Ok(result) => {
            // Cache the result if session_id is present
            if let Some(ref sid) = session_id {
                router_service
                orchestrator_service
                    .cache_route(
                        sid.clone(),
                        tenant_id.as_deref(),
@@ -5,7 +5,6 @@ use brightstaff::handlers::function_calling::function_calling_chat_handler;
use brightstaff::handlers::llm::llm_chat;
use brightstaff::handlers::models::list_models;
use brightstaff::handlers::routing_service::routing_decision;
use brightstaff::router::llm::RouterService;
use brightstaff::router::model_metrics::ModelMetricsService;
use brightstaff::router::orchestrator::OrchestratorService;
use brightstaff::session_cache::init_session_cache;

@@ -37,8 +36,6 @@ use tokio::sync::RwLock;
use tracing::{debug, info, warn};

const BIND_ADDRESS: &str = "0.0.0.0:9091";
const DEFAULT_ROUTING_LLM_PROVIDER: &str = "arch-router";
const DEFAULT_ROUTING_MODEL_NAME: &str = "Arch-Router";
const DEFAULT_ORCHESTRATOR_LLM_PROVIDER: &str = "plano-orchestrator";
const DEFAULT_ORCHESTRATOR_MODEL_NAME: &str = "Plano-Orchestrator";

@@ -161,20 +158,6 @@ async fn init_app_state(

    let overrides = config.overrides.clone().unwrap_or_default();

    let routing_model_name: String = overrides
        .llm_routing_model
        .as_deref()
        .map(|m| m.split_once('/').map(|(_, id)| id).unwrap_or(m))
        .unwrap_or(DEFAULT_ROUTING_MODEL_NAME)
        .to_string();

    let routing_llm_provider = config
        .model_providers
        .iter()
        .find(|p| p.model.as_deref() == Some(routing_model_name.as_str()))
        .map(|p| p.name.clone())
        .unwrap_or_else(|| DEFAULT_ROUTING_LLM_PROVIDER.to_string());

    let session_ttl_seconds = config.routing.as_ref().and_then(|r| r.session_ttl_seconds);
    let session_cache = init_session_cache(config).await?;
@@ -304,20 +287,11 @@ async fn init_app_state(
        .and_then(|r| r.session_cache.as_ref())
        .and_then(|c| c.tenant_header.clone());

    let router_service = Arc::new(RouterService::new(
        config.routing_preferences.clone(),
        metrics_service,
        format!("{llm_provider_url}{CHAT_COMPLETIONS_PATH}"),
        routing_model_name,
        routing_llm_provider,
        session_ttl_seconds,
        session_cache,
        session_tenant_header,
    ));

    // Resolve model name: prefer llm_routing_model override, then agent_orchestration_model, then default.
    let orchestrator_model_name: String = overrides
        .agent_orchestration_model
        .llm_routing_model
        .as_deref()
        .or(overrides.agent_orchestration_model.as_deref())
        .map(|m| m.split_once('/').map(|(_, id)| id).unwrap_or(m))
        .unwrap_or(DEFAULT_ORCHESTRATOR_MODEL_NAME)
        .to_string();
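The comment above spells out the fallback chain for the orchestration model name. A small sketch of that resolution, with plain Option<&str> arguments standing in for the parsed config overrides:

// Minimal sketch with hypothetical Option<&str> overrides; the real code
// reads these from the configuration's overrides section.
const DEFAULT_ORCHESTRATOR_MODEL_NAME: &str = "Plano-Orchestrator";

fn resolve_model_name(
    llm_routing_model: Option<&str>,
    agent_orchestration_model: Option<&str>,
) -> String {
    llm_routing_model
        .or(agent_orchestration_model)
        // "provider/model" -> keep only the model id
        .map(|m| m.split_once('/').map(|(_, id)| id).unwrap_or(m))
        .unwrap_or(DEFAULT_ORCHESTRATOR_MODEL_NAME)
        .to_string()
}

fn main() {
    assert_eq!(resolve_model_name(Some("openai/gpt-4o"), None), "gpt-4o");
    assert_eq!(resolve_model_name(None, Some("my-orchestrator")), "my-orchestrator");
    assert_eq!(resolve_model_name(None, None), "Plano-Orchestrator");
}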
@@ -329,10 +303,20 @@ async fn init_app_state(
        .map(|p| p.name.clone())
        .unwrap_or_else(|| DEFAULT_ORCHESTRATOR_LLM_PROVIDER.to_string());

    let orchestrator_service = Arc::new(OrchestratorService::new(
    let orchestrator_max_tokens = overrides
        .orchestrator_model_context_length
        .unwrap_or(brightstaff::router::orchestrator_model_v1::MAX_TOKEN_LEN);

    let orchestrator_service = Arc::new(OrchestratorService::with_routing(
        format!("{llm_provider_url}{CHAT_COMPLETIONS_PATH}"),
        orchestrator_model_name,
        orchestrator_llm_provider,
        config.routing_preferences.clone(),
        metrics_service,
        session_ttl_seconds,
        session_cache,
        session_tenant_header,
        orchestrator_max_tokens,
    ));

    let state_storage = init_state_storage(config).await?;
@@ -343,7 +327,6 @@ async fn init_app_state(
        .and_then(|tracing| tracing.span_attributes.clone());

    Ok(AppState {
        router_service,
        orchestrator_service,
        model_aliases: config.model_aliases.clone(),
        llm_providers: Arc::new(RwLock::new(llm_providers)),

@@ -430,7 +413,7 @@ async fn route(
    ) {
        return routing_decision(
            req,
            Arc::clone(&state.router_service),
            Arc::clone(&state.orchestrator_service),
            stripped,
            &state.span_attributes,
        )
@ -1,371 +0,0 @@
|
|||
use std::{borrow::Cow, collections::HashMap, sync::Arc, time::Duration};
|
||||
|
||||
use common::{
|
||||
configuration::TopLevelRoutingPreference,
|
||||
consts::{ARCH_PROVIDER_HINT_HEADER, REQUEST_ID_HEADER, TRACE_PARENT_HEADER},
|
||||
};
|
||||
|
||||
use super::router_model::{ModelUsagePreference, RoutingPreference};
|
||||
use hermesllm::apis::openai::Message;
|
||||
use hyper::header;
|
||||
use thiserror::Error;
|
||||
use tracing::{debug, info};
|
||||
|
||||
use super::http::{self, post_and_extract_content};
|
||||
use super::model_metrics::ModelMetricsService;
|
||||
use super::router_model::RouterModel;
|
||||
|
||||
use crate::router::router_model_v1;
|
||||
use crate::session_cache::SessionCache;
|
||||
|
||||
pub use crate::session_cache::CachedRoute;
|
||||
|
||||
const DEFAULT_SESSION_TTL_SECONDS: u64 = 600;
|
||||
|
||||
pub struct RouterService {
|
||||
router_url: String,
|
||||
client: reqwest::Client,
|
||||
router_model: Arc<dyn RouterModel>,
|
||||
routing_provider_name: String,
|
||||
top_level_preferences: HashMap<String, TopLevelRoutingPreference>,
|
||||
metrics_service: Option<Arc<ModelMetricsService>>,
|
||||
session_cache: Arc<dyn SessionCache>,
|
||||
session_ttl: Duration,
|
||||
tenant_header: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum RoutingError {
|
||||
#[error(transparent)]
|
||||
Http(#[from] http::HttpError),
|
||||
|
||||
#[error("Router model error: {0}")]
|
||||
RouterModelError(#[from] super::router_model::RoutingModelError),
|
||||
}
|
||||
|
||||
pub type Result<T> = std::result::Result<T, RoutingError>;
|
||||
|
||||
impl RouterService {
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn new(
|
||||
top_level_prefs: Option<Vec<TopLevelRoutingPreference>>,
|
||||
metrics_service: Option<Arc<ModelMetricsService>>,
|
||||
router_url: String,
|
||||
routing_model_name: String,
|
||||
routing_provider_name: String,
|
||||
session_ttl_seconds: Option<u64>,
|
||||
session_cache: Arc<dyn SessionCache>,
|
||||
tenant_header: Option<String>,
|
||||
) -> Self {
|
||||
let top_level_preferences: HashMap<String, TopLevelRoutingPreference> = top_level_prefs
|
||||
.map_or_else(HashMap::new, |prefs| {
|
||||
prefs.into_iter().map(|p| (p.name.clone(), p)).collect()
|
||||
});
|
||||
|
||||
// Build sentinel routes for RouterModelV1: route_name → first model.
|
||||
// RouterModelV1 uses this to build its prompt; RouterService overrides
|
||||
// the model selection via rank_models() after the route is determined.
|
||||
let sentinel_routes: HashMap<String, Vec<RoutingPreference>> = top_level_preferences
|
||||
.iter()
|
||||
.filter_map(|(name, pref)| {
|
||||
pref.models.first().map(|first_model| {
|
||||
(
|
||||
first_model.clone(),
|
||||
vec![RoutingPreference {
|
||||
name: name.clone(),
|
||||
description: pref.description.clone(),
|
||||
}],
|
||||
)
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
let router_model = Arc::new(router_model_v1::RouterModelV1::new(
|
||||
sentinel_routes,
|
||||
routing_model_name,
|
||||
router_model_v1::MAX_TOKEN_LEN,
|
||||
));
|
||||
|
||||
let session_ttl =
|
||||
Duration::from_secs(session_ttl_seconds.unwrap_or(DEFAULT_SESSION_TTL_SECONDS));
|
||||
|
||||
RouterService {
|
||||
router_url,
|
||||
client: reqwest::Client::new(),
|
||||
router_model,
|
||||
routing_provider_name,
|
||||
top_level_preferences,
|
||||
metrics_service,
|
||||
session_cache,
|
||||
session_ttl,
|
||||
tenant_header,
|
||||
}
|
||||
}
|
||||
|
||||
/// Name of the HTTP header used to scope cache keys by tenant, if configured.
|
||||
#[must_use]
|
||||
pub fn tenant_header(&self) -> Option<&str> {
|
||||
self.tenant_header.as_deref()
|
||||
}
|
||||
|
||||
/// Build the cache key, optionally scoped by tenant: `{tenant_id}:{session_id}` or `{session_id}`.
|
||||
/// Returns a borrowed key when no tenant prefix is needed, avoiding an allocation.
|
||||
fn session_key<'a>(tenant_id: Option<&str>, session_id: &'a str) -> Cow<'a, str> {
|
||||
match tenant_id {
|
||||
Some(t) => Cow::Owned(format!("{t}:{session_id}")),
|
||||
None => Cow::Borrowed(session_id),
|
||||
}
|
||||
}
|
||||
|
||||
/// Look up a cached routing decision by session ID.
|
||||
/// Returns None if not found or expired.
|
||||
pub async fn get_cached_route(
|
||||
&self,
|
||||
session_id: &str,
|
||||
tenant_id: Option<&str>,
|
||||
) -> Option<CachedRoute> {
|
||||
self.session_cache
|
||||
.get(&Self::session_key(tenant_id, session_id))
|
||||
.await
|
||||
}
|
||||
|
||||
/// Store a routing decision in the session cache.
|
||||
pub async fn cache_route(
|
||||
&self,
|
||||
session_id: String,
|
||||
tenant_id: Option<&str>,
|
||||
model_name: String,
|
||||
route_name: Option<String>,
|
||||
) {
|
||||
self.session_cache
|
||||
.put(
|
||||
&Self::session_key(tenant_id, &session_id),
|
||||
CachedRoute {
|
||||
model_name,
|
||||
route_name,
|
||||
},
|
||||
self.session_ttl,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
pub async fn determine_route(
|
||||
&self,
|
||||
messages: &[Message],
|
||||
traceparent: &str,
|
||||
inline_routing_preferences: Option<Vec<TopLevelRoutingPreference>>,
|
||||
request_id: &str,
|
||||
) -> Result<Option<(String, Vec<String>)>> {
|
||||
if messages.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
// Build inline top-level map from request if present (inline overrides config).
|
||||
let inline_top_map: Option<HashMap<String, TopLevelRoutingPreference>> =
|
||||
inline_routing_preferences
|
||||
.map(|prefs| prefs.into_iter().map(|p| (p.name.clone(), p)).collect());
|
||||
|
||||
// No routing defined — skip the router call entirely.
|
||||
if inline_top_map.is_none() && self.top_level_preferences.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
// For inline overrides, build synthetic ModelUsagePreference list so RouterModelV1
|
||||
// generates the correct prompt (route name + description pairs).
|
||||
// For config-level prefs the sentinel routes are already baked into RouterModelV1.
|
||||
let effective_usage_preferences: Option<Vec<ModelUsagePreference>> =
|
||||
inline_top_map.as_ref().map(|inline_map| {
|
||||
inline_map
|
||||
.values()
|
||||
.map(|p| ModelUsagePreference {
|
||||
model: p.models.first().cloned().unwrap_or_default(),
|
||||
routing_preferences: vec![RoutingPreference {
|
||||
name: p.name.clone(),
|
||||
description: p.description.clone(),
|
||||
}],
|
||||
})
|
||||
.collect()
|
||||
});
|
||||
|
||||
let router_request = self
|
||||
.router_model
|
||||
.generate_request(messages, &effective_usage_preferences);
|
||||
|
||||
debug!(
|
||||
model = %self.router_model.get_model_name(),
|
||||
endpoint = %self.router_url,
|
||||
"sending request to arch-router"
|
||||
);
|
||||
|
||||
let body = serde_json::to_string(&router_request)
|
||||
.map_err(super::router_model::RoutingModelError::from)?;
|
||||
debug!(body = %body, "arch router request");
|
||||
|
||||
let mut headers = header::HeaderMap::new();
|
||||
headers.insert(
|
||||
header::CONTENT_TYPE,
|
||||
header::HeaderValue::from_static("application/json"),
|
||||
);
|
||||
if let Ok(val) = header::HeaderValue::from_str(&self.routing_provider_name) {
|
||||
headers.insert(
|
||||
header::HeaderName::from_static(ARCH_PROVIDER_HINT_HEADER),
|
||||
val,
|
||||
);
|
||||
}
|
||||
if let Ok(val) = header::HeaderValue::from_str(traceparent) {
|
||||
headers.insert(header::HeaderName::from_static(TRACE_PARENT_HEADER), val);
|
||||
}
|
||||
if let Ok(val) = header::HeaderValue::from_str(request_id) {
|
||||
headers.insert(header::HeaderName::from_static(REQUEST_ID_HEADER), val);
|
||||
}
|
||||
headers.insert(
|
||||
header::HeaderName::from_static("model"),
|
||||
header::HeaderValue::from_static("arch-router"),
|
||||
);
|
||||
|
||||
let Some((content, elapsed)) =
|
||||
post_and_extract_content(&self.client, &self.router_url, headers, body).await?
|
||||
else {
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
// Parse the route name from the router response.
|
||||
let parsed = self
|
||||
.router_model
|
||||
.parse_response(&content, &effective_usage_preferences)?;
|
||||
|
||||
let result = if let Some((route_name, _sentinel)) = parsed {
|
||||
let top_pref = inline_top_map
|
||||
.as_ref()
|
||||
.and_then(|m| m.get(&route_name))
|
||||
.or_else(|| self.top_level_preferences.get(&route_name));
|
||||
|
||||
if let Some(pref) = top_pref {
|
||||
let ranked = match &self.metrics_service {
|
||||
Some(svc) => svc.rank_models(&pref.models, &pref.selection_policy).await,
|
||||
None => pref.models.clone(),
|
||||
};
|
||||
Some((route_name, ranked))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
info!(
|
||||
content = %content.replace("\n", "\\n"),
|
||||
selected_model = ?result,
|
||||
response_time_ms = elapsed.as_millis(),
|
||||
"arch-router determined route"
|
||||
);
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::session_cache::memory::MemorySessionCache;
|
||||
|
||||
fn make_router_service(ttl_seconds: u64, max_entries: usize) -> RouterService {
|
||||
let session_cache = Arc::new(MemorySessionCache::new(max_entries));
|
||||
RouterService::new(
|
||||
None,
|
||||
None,
|
||||
"http://localhost:12001/v1/chat/completions".to_string(),
|
||||
"Arch-Router".to_string(),
|
||||
"arch-router".to_string(),
|
||||
Some(ttl_seconds),
|
||||
session_cache,
|
||||
None,
|
||||
)
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_cache_miss_returns_none() {
|
||||
let svc = make_router_service(600, 100);
|
||||
assert!(svc
|
||||
.get_cached_route("unknown-session", None)
|
||||
.await
|
||||
.is_none());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_cache_hit_returns_cached_route() {
|
||||
let svc = make_router_service(600, 100);
|
||||
svc.cache_route(
|
||||
"s1".to_string(),
|
||||
None,
|
||||
"gpt-4o".to_string(),
|
||||
Some("code".to_string()),
|
||||
)
|
||||
.await;
|
||||
|
||||
let cached = svc.get_cached_route("s1", None).await.unwrap();
|
||||
assert_eq!(cached.model_name, "gpt-4o");
|
||||
assert_eq!(cached.route_name, Some("code".to_string()));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_cache_expired_entry_returns_none() {
|
||||
let svc = make_router_service(0, 100);
|
||||
svc.cache_route("s1".to_string(), None, "gpt-4o".to_string(), None)
|
||||
.await;
|
||||
assert!(svc.get_cached_route("s1", None).await.is_none());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_expired_entries_not_returned() {
|
||||
let svc = make_router_service(0, 100);
|
||||
svc.cache_route("s1".to_string(), None, "gpt-4o".to_string(), None)
|
||||
.await;
|
||||
svc.cache_route("s2".to_string(), None, "claude".to_string(), None)
|
||||
.await;
|
||||
|
||||
// Entries with TTL=0 should be expired immediately
|
||||
assert!(svc.get_cached_route("s1", None).await.is_none());
|
||||
assert!(svc.get_cached_route("s2", None).await.is_none());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_cache_evicts_oldest_when_full() {
|
||||
let svc = make_router_service(600, 2);
|
||||
svc.cache_route("s1".to_string(), None, "model-a".to_string(), None)
|
||||
.await;
|
||||
tokio::time::sleep(Duration::from_millis(10)).await;
|
||||
svc.cache_route("s2".to_string(), None, "model-b".to_string(), None)
|
||||
.await;
|
||||
|
||||
svc.cache_route("s3".to_string(), None, "model-c".to_string(), None)
|
||||
.await;
|
||||
|
||||
// s1 should be evicted (oldest); s2 and s3 should remain
|
||||
assert!(svc.get_cached_route("s1", None).await.is_none());
|
||||
assert!(svc.get_cached_route("s2", None).await.is_some());
|
||||
assert!(svc.get_cached_route("s3", None).await.is_some());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_cache_update_existing_session_does_not_evict() {
|
||||
let svc = make_router_service(600, 2);
|
||||
svc.cache_route("s1".to_string(), None, "model-a".to_string(), None)
|
||||
.await;
|
||||
svc.cache_route("s2".to_string(), None, "model-b".to_string(), None)
|
||||
.await;
|
||||
|
||||
svc.cache_route(
|
||||
"s1".to_string(),
|
||||
None,
|
||||
"model-a-updated".to_string(),
|
||||
Some("route".to_string()),
|
||||
)
|
||||
.await;
|
||||
|
||||
// Both sessions should still be present
|
||||
let s1 = svc.get_cached_route("s1", None).await.unwrap();
|
||||
assert_eq!(s1.model_name, "model-a-updated");
|
||||
assert!(svc.get_cached_route("s2", None).await.is_some());
|
||||
}
|
||||
}
|
||||
@@ -1,8 +1,5 @@
pub(crate) mod http;
pub mod llm;
pub mod model_metrics;
pub mod orchestrator;
pub mod orchestrator_model;
pub mod orchestrator_model_v1;
pub mod router_model;
pub mod router_model_v1;
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
use std::{collections::HashMap, sync::Arc};
|
||||
use std::{borrow::Cow, collections::HashMap, sync::Arc, time::Duration};
|
||||
|
||||
use common::{
|
||||
configuration::{AgentUsagePreference, OrchestrationPreference},
|
||||
configuration::{AgentUsagePreference, OrchestrationPreference, TopLevelRoutingPreference},
|
||||
consts::{ARCH_PROVIDER_HINT_HEADER, REQUEST_ID_HEADER},
|
||||
};
|
||||
use hermesllm::apis::openai::Message;
|
||||
|
|
@ -12,15 +12,26 @@ use thiserror::Error;
|
|||
use tracing::{debug, info};
|
||||
|
||||
use super::http::{self, post_and_extract_content};
|
||||
use super::model_metrics::ModelMetricsService;
|
||||
use super::orchestrator_model::OrchestratorModel;
|
||||
|
||||
use crate::router::orchestrator_model_v1;
|
||||
use crate::session_cache::SessionCache;
|
||||
|
||||
pub use crate::session_cache::CachedRoute;
|
||||
|
||||
const DEFAULT_SESSION_TTL_SECONDS: u64 = 600;
|
||||
|
||||
pub struct OrchestratorService {
|
||||
orchestrator_url: String,
|
||||
client: reqwest::Client,
|
||||
orchestrator_model: Arc<dyn OrchestratorModel>,
|
||||
orchestrator_provider_name: String,
|
||||
top_level_preferences: HashMap<String, TopLevelRoutingPreference>,
|
||||
metrics_service: Option<Arc<ModelMetricsService>>,
|
||||
session_cache: Option<Arc<dyn SessionCache>>,
|
||||
session_ttl: Duration,
|
||||
tenant_header: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
|
|
@ -39,13 +50,12 @@ impl OrchestratorService {
|
|||
orchestrator_url: String,
|
||||
orchestration_model_name: String,
|
||||
orchestrator_provider_name: String,
|
||||
max_token_length: usize,
|
||||
) -> Self {
|
||||
let agent_orchestrations: HashMap<String, Vec<OrchestrationPreference>> = HashMap::new();
|
||||
|
||||
let orchestrator_model = Arc::new(orchestrator_model_v1::OrchestratorModelV1::new(
|
||||
agent_orchestrations,
|
||||
orchestration_model_name.clone(),
|
||||
orchestrator_model_v1::MAX_TOKEN_LEN,
|
||||
HashMap::new(),
|
||||
orchestration_model_name,
|
||||
max_token_length,
|
||||
));
|
||||
|
||||
OrchestratorService {
|
||||
|
|
@ -53,9 +63,182 @@ impl OrchestratorService {
|
|||
client: reqwest::Client::new(),
|
||||
orchestrator_model,
|
||||
orchestrator_provider_name,
|
||||
top_level_preferences: HashMap::new(),
|
||||
metrics_service: None,
|
||||
session_cache: None,
|
||||
session_ttl: Duration::from_secs(DEFAULT_SESSION_TTL_SECONDS),
|
||||
tenant_header: None,
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn with_routing(
|
||||
orchestrator_url: String,
|
||||
orchestration_model_name: String,
|
||||
orchestrator_provider_name: String,
|
||||
top_level_prefs: Option<Vec<TopLevelRoutingPreference>>,
|
||||
metrics_service: Option<Arc<ModelMetricsService>>,
|
||||
session_ttl_seconds: Option<u64>,
|
||||
session_cache: Arc<dyn SessionCache>,
|
||||
tenant_header: Option<String>,
|
||||
max_token_length: usize,
|
||||
) -> Self {
|
||||
let top_level_preferences: HashMap<String, TopLevelRoutingPreference> = top_level_prefs
|
||||
.map_or_else(HashMap::new, |prefs| {
|
||||
prefs.into_iter().map(|p| (p.name.clone(), p)).collect()
|
||||
});
|
||||
|
||||
let orchestrator_model = Arc::new(orchestrator_model_v1::OrchestratorModelV1::new(
|
||||
HashMap::new(),
|
||||
orchestration_model_name,
|
||||
max_token_length,
|
||||
));
|
||||
|
||||
let session_ttl =
|
||||
Duration::from_secs(session_ttl_seconds.unwrap_or(DEFAULT_SESSION_TTL_SECONDS));
|
||||
|
||||
OrchestratorService {
|
||||
orchestrator_url,
|
||||
client: reqwest::Client::new(),
|
||||
orchestrator_model,
|
||||
orchestrator_provider_name,
|
||||
top_level_preferences,
|
||||
metrics_service,
|
||||
session_cache: Some(session_cache),
|
||||
session_ttl,
|
||||
tenant_header,
|
||||
}
|
||||
}
|
||||
|
||||
// ---- Session cache methods ----
|
||||
|
||||
#[must_use]
|
||||
pub fn tenant_header(&self) -> Option<&str> {
|
||||
self.tenant_header.as_deref()
|
||||
}
|
||||
|
||||
fn session_key<'a>(tenant_id: Option<&str>, session_id: &'a str) -> Cow<'a, str> {
|
||||
match tenant_id {
|
||||
Some(t) => Cow::Owned(format!("{t}:{session_id}")),
|
||||
None => Cow::Borrowed(session_id),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn get_cached_route(
|
||||
&self,
|
||||
session_id: &str,
|
||||
tenant_id: Option<&str>,
|
||||
) -> Option<CachedRoute> {
|
||||
let cache = self.session_cache.as_ref()?;
|
||||
cache.get(&Self::session_key(tenant_id, session_id)).await
|
||||
}
|
||||
|
||||
pub async fn cache_route(
|
||||
&self,
|
||||
session_id: String,
|
||||
tenant_id: Option<&str>,
|
||||
model_name: String,
|
||||
route_name: Option<String>,
|
||||
) {
|
||||
if let Some(ref cache) = self.session_cache {
|
||||
cache
|
||||
.put(
|
||||
&Self::session_key(tenant_id, &session_id),
|
||||
CachedRoute {
|
||||
model_name,
|
||||
route_name,
|
||||
},
|
||||
self.session_ttl,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
}
|
||||
|
||||
// ---- LLM routing ----
|
||||
|
||||
pub async fn determine_route(
|
||||
&self,
|
||||
messages: &[Message],
|
||||
inline_routing_preferences: Option<Vec<TopLevelRoutingPreference>>,
|
||||
request_id: &str,
|
||||
) -> Result<Option<(String, Vec<String>)>> {
|
||||
if messages.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let inline_top_map: Option<HashMap<String, TopLevelRoutingPreference>> =
|
||||
inline_routing_preferences
|
||||
.map(|prefs| prefs.into_iter().map(|p| (p.name.clone(), p)).collect());
|
||||
|
||||
if inline_top_map.is_none() && self.top_level_preferences.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let effective_source = inline_top_map
|
||||
.as_ref()
|
||||
.unwrap_or(&self.top_level_preferences);
|
||||
|
||||
let effective_prefs: Vec<AgentUsagePreference> = effective_source
|
||||
.values()
|
||||
.map(|p| AgentUsagePreference {
|
||||
model: p.models.first().cloned().unwrap_or_default(),
|
||||
orchestration_preferences: vec![OrchestrationPreference {
|
||||
name: p.name.clone(),
|
||||
description: p.description.clone(),
|
||||
}],
|
||||
})
|
||||
.collect();
|
||||
|
||||
let orchestration_result = self
|
||||
.determine_orchestration(
|
||||
messages,
|
||||
Some(effective_prefs),
|
||||
Some(request_id.to_string()),
|
||||
)
|
||||
.await?;
|
||||
|
||||
let result = if let Some(ref routes) = orchestration_result {
|
||||
if routes.len() > 1 {
|
||||
let all_routes: Vec<&str> = routes.iter().map(|(name, _)| name.as_str()).collect();
|
||||
info!(
|
||||
routes = ?all_routes,
|
||||
using = %all_routes.first().unwrap_or(&"none"),
|
||||
"plano-orchestrator detected multiple intents, using first"
|
||||
);
|
||||
}
|
||||
|
||||
if let Some((route_name, _)) = routes.first() {
|
||||
let top_pref = inline_top_map
|
||||
.as_ref()
|
||||
.and_then(|m| m.get(route_name))
|
||||
.or_else(|| self.top_level_preferences.get(route_name));
|
||||
|
||||
if let Some(pref) = top_pref {
|
||||
let ranked = match &self.metrics_service {
|
||||
Some(svc) => svc.rank_models(&pref.models, &pref.selection_policy).await,
|
||||
None => pref.models.clone(),
|
||||
};
|
||||
Some((route_name.clone(), ranked))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
info!(
|
||||
selected_model = ?result,
|
||||
"plano-orchestrator determined route"
|
||||
);
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
// ---- Agent orchestration (existing) ----
|
||||
|
||||
pub async fn determine_orchestration(
|
||||
&self,
|
||||
messages: &[Message],
|
||||
|
|
@ -80,12 +263,12 @@ impl OrchestratorService {
|
|||
debug!(
|
||||
model = %self.orchestrator_model.get_model_name(),
|
||||
endpoint = %self.orchestrator_url,
|
||||
"sending request to arch-orchestrator"
|
||||
"sending request to plano-orchestrator"
|
||||
);
|
||||
|
||||
let body = serde_json::to_string(&orchestrator_request)
|
||||
.map_err(super::orchestrator_model::OrchestratorModelError::from)?;
|
||||
debug!(body = %body, "arch orchestrator request");
|
||||
debug!(body = %body, "plano-orchestrator request");
|
||||
|
||||
let mut headers = header::HeaderMap::new();
|
||||
headers.insert(
|
||||
|
|
@ -98,7 +281,6 @@ impl OrchestratorService {
|
|||
.unwrap_or_else(|_| header::HeaderValue::from_static("plano-orchestrator")),
|
||||
);
|
||||
|
||||
// Inject OpenTelemetry trace context from current span
|
||||
global::get_text_map_propagator(|propagator| {
|
||||
let cx =
|
||||
tracing_opentelemetry::OpenTelemetrySpanExt::context(&tracing::Span::current());
|
||||
|
|
@ -130,9 +312,113 @@ impl OrchestratorService {
|
|||
content = %content.replace("\n", "\\n"),
|
||||
selected_routes = ?parsed,
|
||||
response_time_ms = elapsed.as_millis(),
|
||||
"arch-orchestrator determined routes"
|
||||
"plano-orchestrator determined routes"
|
||||
);
|
||||
|
||||
Ok(parsed)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::session_cache::memory::MemorySessionCache;
|
||||
|
||||
fn make_orchestrator_service(ttl_seconds: u64, max_entries: usize) -> OrchestratorService {
|
||||
let session_cache = Arc::new(MemorySessionCache::new(max_entries));
|
||||
OrchestratorService::with_routing(
|
||||
"http://localhost:12001/v1/chat/completions".to_string(),
|
||||
"Plano-Orchestrator".to_string(),
|
||||
"plano-orchestrator".to_string(),
|
||||
None,
|
||||
None,
|
||||
Some(ttl_seconds),
|
||||
session_cache,
|
||||
None,
|
||||
orchestrator_model_v1::MAX_TOKEN_LEN,
|
||||
)
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_cache_miss_returns_none() {
|
||||
let svc = make_orchestrator_service(600, 100);
|
||||
assert!(svc
|
||||
.get_cached_route("unknown-session", None)
|
||||
.await
|
||||
.is_none());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_cache_hit_returns_cached_route() {
|
||||
let svc = make_orchestrator_service(600, 100);
|
||||
svc.cache_route(
|
||||
"s1".to_string(),
|
||||
None,
|
||||
"gpt-4o".to_string(),
|
||||
Some("code".to_string()),
|
||||
)
|
||||
.await;
|
||||
|
||||
let cached = svc.get_cached_route("s1", None).await.unwrap();
|
||||
assert_eq!(cached.model_name, "gpt-4o");
|
||||
assert_eq!(cached.route_name, Some("code".to_string()));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_cache_expired_entry_returns_none() {
|
||||
let svc = make_orchestrator_service(0, 100);
|
||||
svc.cache_route("s1".to_string(), None, "gpt-4o".to_string(), None)
|
||||
.await;
|
||||
assert!(svc.get_cached_route("s1", None).await.is_none());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_expired_entries_not_returned() {
|
||||
let svc = make_orchestrator_service(0, 100);
|
||||
svc.cache_route("s1".to_string(), None, "gpt-4o".to_string(), None)
|
||||
.await;
|
||||
svc.cache_route("s2".to_string(), None, "claude".to_string(), None)
|
||||
.await;
|
||||
|
||||
assert!(svc.get_cached_route("s1", None).await.is_none());
|
||||
assert!(svc.get_cached_route("s2", None).await.is_none());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_cache_evicts_oldest_when_full() {
|
||||
let svc = make_orchestrator_service(600, 2);
|
||||
svc.cache_route("s1".to_string(), None, "model-a".to_string(), None)
|
||||
.await;
|
||||
tokio::time::sleep(Duration::from_millis(10)).await;
|
||||
svc.cache_route("s2".to_string(), None, "model-b".to_string(), None)
|
||||
.await;
|
||||
|
||||
svc.cache_route("s3".to_string(), None, "model-c".to_string(), None)
|
||||
.await;
|
||||
|
||||
assert!(svc.get_cached_route("s1", None).await.is_none());
|
||||
assert!(svc.get_cached_route("s2", None).await.is_some());
|
||||
assert!(svc.get_cached_route("s3", None).await.is_some());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_cache_update_existing_session_does_not_evict() {
|
||||
let svc = make_orchestrator_service(600, 2);
|
||||
svc.cache_route("s1".to_string(), None, "model-a".to_string(), None)
|
||||
.await;
|
||||
svc.cache_route("s2".to_string(), None, "model-b".to_string(), None)
|
||||
.await;
|
||||
|
||||
svc.cache_route(
|
||||
"s1".to_string(),
|
||||
None,
|
||||
"model-a-updated".to_string(),
|
||||
Some("route".to_string()),
|
||||
)
|
||||
.await;
|
||||
|
||||
let s1 = svc.get_cached_route("s1", None).await.unwrap();
|
||||
assert_eq!(s1.model_name, "model-a-updated");
|
||||
assert!(svc.get_cached_route("s2", None).await.is_some());
|
||||
}
|
||||
}
|
||||
|
|
|
@@ -11,8 +11,7 @@ pub enum OrchestratorModelError {
pub type Result<T> = std::result::Result<T, OrchestratorModelError>;

/// OrchestratorModel trait for handling orchestration requests.
/// Unlike RouterModel which returns a single route, OrchestratorModel
/// can return multiple routes as the model output format is:
/// Returns multiple routes as the model output format is:
/// {"route": ["route_name_1", "route_name_2", ...]}
pub trait OrchestratorModel: Send + Sync {
    fn generate_request(
@@ -8,7 +8,7 @@ use tracing::{debug, warn};

use super::orchestrator_model::{OrchestratorModel, OrchestratorModelError};

pub const MAX_TOKEN_LEN: usize = 2048; // Default max token length for the orchestration model
pub const MAX_TOKEN_LEN: usize = 8192; // Default max token length for the orchestration model

/// Custom JSON formatter that produces spaced JSON (space after colons and commas), same as JSON in python
struct SpacedJsonFormatter;
|
|
|||
|
|
@ -1,39 +0,0 @@
|
|||
use hermesllm::apis::openai::{ChatCompletionsRequest, Message};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use thiserror::Error;
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum RoutingModelError {
|
||||
#[error("Failed to parse JSON: {0}")]
|
||||
JsonError(#[from] serde_json::Error),
|
||||
}
|
||||
|
||||
pub type Result<T> = std::result::Result<T, RoutingModelError>;
|
||||
|
||||
/// Internal route descriptor passed to the router model to build its prompt.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct RoutingPreference {
|
||||
pub name: String,
|
||||
pub description: String,
|
||||
}
|
||||
|
||||
/// Groups a model with its routing preferences (used internally by RouterModelV1).
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ModelUsagePreference {
|
||||
pub model: String,
|
||||
pub routing_preferences: Vec<RoutingPreference>,
|
||||
}
|
||||
|
||||
pub trait RouterModel: Send + Sync {
|
||||
fn generate_request(
|
||||
&self,
|
||||
messages: &[Message],
|
||||
usage_preferences: &Option<Vec<ModelUsagePreference>>,
|
||||
) -> ChatCompletionsRequest;
|
||||
fn parse_response(
|
||||
&self,
|
||||
content: &str,
|
||||
usage_preferences: &Option<Vec<ModelUsagePreference>>,
|
||||
) -> Result<Option<(String, String)>>;
|
||||
fn get_model_name(&self) -> String;
|
||||
}
|
||||
|
|
@ -1,842 +0,0 @@
|
|||
use std::collections::HashMap;
|
||||
|
||||
use super::router_model::{ModelUsagePreference, RoutingPreference};
|
||||
use hermesllm::apis::openai::{ChatCompletionsRequest, Message, MessageContent, Role};
|
||||
use hermesllm::transforms::lib::ExtractText;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::{debug, warn};
|
||||
|
||||
use super::router_model::{RouterModel, RoutingModelError};
|
||||
|
||||
pub const MAX_TOKEN_LEN: usize = 2048; // Default max token length for the routing model
|
||||
pub const ARCH_ROUTER_V1_SYSTEM_PROMPT: &str = r#"
|
||||
You are a helpful assistant designed to find the best suited route.
|
||||
You are provided with route description within <routes></routes> XML tags:
|
||||
<routes>
|
||||
{routes}
|
||||
</routes>
|
||||
|
||||
<conversation>
|
||||
{conversation}
|
||||
</conversation>
|
||||
|
||||
Your task is to decide which route is best suit with user intent on the conversation in <conversation></conversation> XML tags. Follow the instruction:
|
||||
1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}.
|
||||
2. You must analyze the route descriptions and find the best match route for user latest intent.
|
||||
3. You only response the name of the route that best matches the user's request, use the exact name in the <routes></routes>.
|
||||
|
||||
Based on your analysis, provide your response in the following JSON formats if you decide to match any route:
|
||||
{"route": "route_name"}
|
||||
"#;
|
||||
|
||||
pub type Result<T> = std::result::Result<T, RoutingModelError>;
|
||||
pub struct RouterModelV1 {
|
||||
llm_route_json_str: String,
|
||||
llm_route_to_model_map: HashMap<String, String>,
|
||||
routing_model: String,
|
||||
max_token_length: usize,
|
||||
}
|
||||
impl RouterModelV1 {
|
||||
pub fn new(
|
||||
llm_routes: HashMap<String, Vec<RoutingPreference>>,
|
||||
routing_model: String,
|
||||
max_token_length: usize,
|
||||
) -> Self {
|
||||
let llm_route_values: Vec<RoutingPreference> =
|
||||
llm_routes.values().flatten().cloned().collect();
|
||||
let llm_route_json_str =
|
||||
serde_json::to_string(&llm_route_values).unwrap_or_else(|_| "[]".to_string());
|
||||
let llm_route_to_model_map: HashMap<String, String> = llm_routes
|
||||
.iter()
|
||||
.flat_map(|(model, prefs)| prefs.iter().map(|pref| (pref.name.clone(), model.clone())))
|
||||
.collect();
|
||||
|
||||
RouterModelV1 {
|
||||
routing_model,
|
||||
max_token_length,
|
||||
llm_route_json_str,
|
||||
llm_route_to_model_map,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
struct LlmRouterResponse {
|
||||
pub route: Option<String>,
|
||||
}
|
||||
|
||||
const TOKEN_LENGTH_DIVISOR: usize = 4; // Approximate token length divisor for UTF-8 characters
|
||||
|
||||
impl RouterModel for RouterModelV1 {
|
||||
fn generate_request(
|
||||
&self,
|
||||
messages: &[Message],
|
||||
usage_preferences_from_request: &Option<Vec<ModelUsagePreference>>,
|
||||
) -> ChatCompletionsRequest {
|
||||
// remove system prompt, tool calls, tool call response and messages without content
|
||||
// if content is empty its likely a tool call
|
||||
// when role == tool its tool call response
|
||||
let messages_vec = messages
|
||||
.iter()
|
||||
.filter(|m| {
|
||||
m.role != Role::System
|
||||
&& m.role != Role::Developer
|
||||
&& m.role != Role::Tool
|
||||
&& !m.content.extract_text().is_empty()
|
||||
})
|
||||
.collect::<Vec<&Message>>();
|
||||
|
||||
// Following code is to ensure that the conversation does not exceed max token length
|
||||
// Note: we use a simple heuristic to estimate token count based on character length to optimize for performance
|
||||
let mut token_count = ARCH_ROUTER_V1_SYSTEM_PROMPT.len() / TOKEN_LENGTH_DIVISOR;
|
||||
let mut selected_messages_list_reversed: Vec<&Message> = vec![];
|
||||
for (selected_messsage_count, message) in messages_vec.iter().rev().enumerate() {
|
||||
let message_token_count = message.content.extract_text().len() / TOKEN_LENGTH_DIVISOR;
|
||||
token_count += message_token_count;
|
||||
if token_count > self.max_token_length {
|
||||
debug!(
|
||||
token_count = token_count,
|
||||
max_tokens = self.max_token_length,
|
||||
selected = selected_messsage_count,
|
||||
total = messages_vec.len(),
|
||||
"token count exceeds max, truncating conversation"
|
||||
);
|
||||
if message.role == Role::User {
|
||||
// If message that exceeds max token length is from user, we need to keep it
|
||||
selected_messages_list_reversed.push(message);
|
||||
}
|
||||
break;
|
||||
}
|
||||
// If we are here, it means that the message is within the max token length
|
||||
selected_messages_list_reversed.push(message);
|
||||
}
|
||||
|
||||
if selected_messages_list_reversed.is_empty() {
|
||||
debug!("no messages selected, using last message");
|
||||
if let Some(last_message) = messages_vec.last() {
|
||||
selected_messages_list_reversed.push(last_message);
|
||||
}
|
||||
}
|
||||
|
||||
// ensure that first and last selected message is from user
|
||||
if let Some(first_message) = selected_messages_list_reversed.first() {
|
||||
if first_message.role != Role::User {
|
||||
warn!("last message is not from user, may lead to incorrect routing");
|
||||
}
|
||||
}
|
||||
if let Some(last_message) = selected_messages_list_reversed.last() {
|
||||
if last_message.role != Role::User {
|
||||
warn!("first message is not from user, may lead to incorrect routing");
|
||||
}
|
||||
}
|
||||
|
||||
// Reverse the selected messages to maintain the conversation order
|
||||
let selected_conversation_list = selected_messages_list_reversed
|
||||
.iter()
|
||||
.rev()
|
||||
.map(|message| {
|
||||
Message {
|
||||
role: message.role.clone(),
|
||||
// we can unwrap here because we have already filtered out messages without content
|
||||
content: Some(MessageContent::Text(
|
||||
message
|
||||
.content
|
||||
.as_ref()
|
||||
.map_or(String::new(), |c| c.to_string()),
|
||||
)),
|
||||
name: None,
|
||||
tool_calls: None,
|
||||
tool_call_id: None,
|
||||
}
|
||||
})
|
||||
.collect::<Vec<Message>>();
|
||||
|
||||
// Generate the router request message based on the usage preferences.
|
||||
// If preferences are passed in request then we use them otherwise we use the default routing model preferences.
|
||||
let router_message = match convert_to_router_preferences(usage_preferences_from_request) {
|
||||
Some(prefs) => generate_router_message(&prefs, &selected_conversation_list),
|
||||
None => generate_router_message(&self.llm_route_json_str, &selected_conversation_list),
|
||||
};
|
||||
|
||||
ChatCompletionsRequest {
|
||||
model: self.routing_model.clone(),
|
||||
messages: vec![Message {
|
||||
content: Some(MessageContent::Text(router_message)),
|
||||
role: Role::User,
|
||||
name: None,
|
||||
tool_calls: None,
|
||||
tool_call_id: None,
|
||||
}],
|
||||
temperature: Some(0.01),
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_response(
|
||||
&self,
|
||||
content: &str,
|
||||
usage_preferences: &Option<Vec<ModelUsagePreference>>,
|
||||
) -> Result<Option<(String, String)>> {
|
||||
if content.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
let router_resp_fixed = fix_json_response(content);
|
||||
let router_response: LlmRouterResponse = serde_json::from_str(router_resp_fixed.as_str())?;
|
||||
|
||||
let selected_route = router_response.route.unwrap_or_default().to_string();
|
||||
|
||||
if selected_route.is_empty() || selected_route == "other" {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
if let Some(usage_preferences) = usage_preferences {
|
||||
// If usage preferences are defined, we need to find the model that matches the selected route
|
||||
let model_name: Option<String> = usage_preferences
|
||||
.iter()
|
||||
.map(|pref| {
|
||||
pref.routing_preferences
|
||||
.iter()
|
||||
.find(|routing_pref| routing_pref.name == selected_route)
|
||||
.map(|_| pref.model.clone())
|
||||
})
|
||||
.find_map(|model| model);
|
||||
|
||||
if let Some(model_name) = model_name {
|
||||
return Ok(Some((selected_route, model_name)));
|
||||
} else {
|
||||
warn!(
|
||||
route = %selected_route,
|
||||
preferences = ?usage_preferences,
|
||||
"no matching model found for route"
|
||||
);
|
||||
return Ok(None);
|
||||
}
|
||||
}
|
||||
|
||||
// If no usage preferences are passed in request then use the default routing model preferences
|
||||
if let Some(model) = self.llm_route_to_model_map.get(&selected_route).cloned() {
|
||||
return Ok(Some((selected_route, model)));
|
||||
}
|
||||
|
||||
warn!(
|
||||
route = %selected_route,
|
||||
preferences = ?self.llm_route_to_model_map,
|
||||
"no model found for route"
|
||||
);
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
fn get_model_name(&self) -> String {
|
||||
self.routing_model.clone()
|
||||
}
|
||||
}
|
||||
|
||||
fn generate_router_message(prefs: &str, selected_conversation_list: &Vec<Message>) -> String {
|
||||
ARCH_ROUTER_V1_SYSTEM_PROMPT
|
||||
.replace("{routes}", prefs)
|
||||
.replace(
|
||||
"{conversation}",
|
||||
&serde_json::to_string(&selected_conversation_list).unwrap_or_default(),
|
||||
)
|
||||
}
|
||||
|
||||
fn convert_to_router_preferences(
|
||||
prefs_from_request: &Option<Vec<ModelUsagePreference>>,
|
||||
) -> Option<String> {
|
||||
if let Some(usage_preferences) = prefs_from_request {
|
||||
let routing_preferences = usage_preferences
|
||||
.iter()
|
||||
.flat_map(|pref| {
|
||||
pref.routing_preferences
|
||||
.iter()
|
||||
.map(|routing_pref| RoutingPreference {
|
||||
name: routing_pref.name.clone(),
|
||||
description: routing_pref.description.clone(),
|
||||
})
|
||||
})
|
||||
.collect::<Vec<RoutingPreference>>();
|
||||
|
||||
return Some(serde_json::to_string(&routing_preferences).unwrap_or_default());
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
fn fix_json_response(body: &str) -> String {
|
||||
let mut updated_body = body.to_string();
|
||||
|
||||
updated_body = updated_body.replace("'", "\"");
|
||||
|
||||
if updated_body.contains("\\n") {
|
||||
updated_body = updated_body.replace("\\n", "");
|
||||
}
|
||||
|
||||
if updated_body.starts_with("```json") {
|
||||
updated_body = updated_body
|
||||
.strip_prefix("```json")
|
||||
.unwrap_or(&updated_body)
|
||||
.to_string();
|
||||
}
|
||||
|
||||
if updated_body.ends_with("```") {
|
||||
updated_body = updated_body
|
||||
.strip_suffix("```")
|
||||
.unwrap_or(&updated_body)
|
||||
.to_string();
|
||||
}
|
||||
|
||||
updated_body
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for dyn RouterModel {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "RouterModel")
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use pretty_assertions::assert_eq;
|
||||
|
||||
#[test]
|
||||
fn test_system_prompt_format() {
|
||||
let expected_prompt = r#"
|
||||
You are a helpful assistant designed to find the best suited route.
|
||||
You are provided with route description within <routes></routes> XML tags:
|
||||
<routes>
|
||||
[{"name":"Image generation","description":"generating image"}]
|
||||
</routes>
|
||||
|
||||
<conversation>
|
||||
[{"role":"user","content":"hi"},{"role":"assistant","content":"Hello! How can I assist you today?"},{"role":"user","content":"given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson"}]
|
||||
</conversation>
|
||||
|
||||
Your task is to decide which route is best suit with user intent on the conversation in <conversation></conversation> XML tags. Follow the instruction:
|
||||
1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}.
|
||||
2. You must analyze the route descriptions and find the best match route for user latest intent.
|
||||
3. You only response the name of the route that best matches the user's request, use the exact name in the <routes></routes>.
|
||||
|
||||
Based on your analysis, provide your response in the following JSON formats if you decide to match any route:
|
||||
{"route": "route_name"}
|
||||
"#;
|
||||
let routes_str = r#"
|
||||
{
|
||||
"gpt-4o": [
|
||||
{"name": "Image generation", "description": "generating image"}
|
||||
]
|
||||
}
|
||||
"#;
|
||||
let llm_routes =
|
||||
serde_json::from_str::<HashMap<String, Vec<RoutingPreference>>>(routes_str).unwrap();
|
||||
let routing_model = "test-model".to_string();
|
||||
let router = RouterModelV1::new(llm_routes, routing_model, usize::MAX);
|
||||
|
||||
let conversation_str = r#"
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "hi"
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "Hello! How can I assist you today?"
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson"
|
||||
}
|
||||
]
|
||||
"#;
|
||||
let conversation: Vec<Message> = serde_json::from_str(conversation_str).unwrap();
|
||||
|
||||
let req = router.generate_request(&conversation, &None);
|
||||
|
||||
let prompt = req.messages[0].content.extract_text();
|
||||
|
||||
assert_eq!(expected_prompt, prompt);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_system_prompt_format_usage_preferences() {
|
||||
let expected_prompt = r#"
|
||||
You are a helpful assistant designed to find the best suited route.
|
||||
You are provided with route description within <routes></routes> XML tags:
|
||||
<routes>
|
||||
[{"name":"code-generation","description":"generating new code snippets, functions, or boilerplate based on user prompts or requirements"}]
|
||||
</routes>
|
||||
|
||||
<conversation>
|
||||
[{"role":"user","content":"hi"},{"role":"assistant","content":"Hello! How can I assist you today?"},{"role":"user","content":"given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson"}]
|
||||
</conversation>
|
||||
|
||||
Your task is to decide which route is best suit with user intent on the conversation in <conversation></conversation> XML tags. Follow the instruction:
|
||||
1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}.
|
||||
2. You must analyze the route descriptions and find the best match route for user latest intent.
|
||||
3. You only response the name of the route that best matches the user's request, use the exact name in the <routes></routes>.
|
||||
|
||||
Based on your analysis, provide your response in the following JSON formats if you decide to match any route:
|
||||
{"route": "route_name"}
|
||||
"#;
|
||||
let routes_str = r#"
|
||||
{
|
||||
"gpt-4o": [
|
||||
{"name": "Image generation", "description": "generating image"}
|
||||
]
|
||||
}
|
||||
"#;
|
||||
let llm_routes =
|
||||
serde_json::from_str::<HashMap<String, Vec<RoutingPreference>>>(routes_str).unwrap();
|
||||
let routing_model = "test-model".to_string();
|
||||
let router = RouterModelV1::new(llm_routes, routing_model, usize::MAX);
|
||||
|
||||
let conversation_str = r#"
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "hi"
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "Hello! How can I assist you today?"
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson"
|
||||
}
|
||||
]
|
||||
"#;
|
||||
let conversation: Vec<Message> = serde_json::from_str(conversation_str).unwrap();
|
||||
|
||||
let usage_preferences = Some(vec![ModelUsagePreference {
|
||||
model: "claude/claude-3-7-sonnet".to_string(),
|
||||
routing_preferences: vec![RoutingPreference {
|
||||
name: "code-generation".to_string(),
|
||||
description: "generating new code snippets, functions, or boilerplate based on user prompts or requirements".to_string(),
|
||||
}],
|
||||
}]);
|
||||
let req = router.generate_request(&conversation, &usage_preferences);
|
||||
|
||||
let prompt = req.messages[0].content.extract_text();
|
||||
|
||||
assert_eq!(expected_prompt, prompt);
|
||||
}
|
||||
|
||||
    #[test]
    fn test_conversation_exceed_token_count() {
        let expected_prompt = r#"
You are a helpful assistant designed to find the best suited route.
You are provided with route description within <routes></routes> XML tags:
<routes>
[{"name":"Image generation","description":"generating image"}]
</routes>

<conversation>
[{"role":"user","content":"given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson"}]
</conversation>

Your task is to decide which route is best suit with user intent on the conversation in <conversation></conversation> XML tags. Follow the instruction:
1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}.
2. You must analyze the route descriptions and find the best match route for user latest intent.
3. You only response the name of the route that best matches the user's request, use the exact name in the <routes></routes>.

Based on your analysis, provide your response in the following JSON formats if you decide to match any route:
{"route": "route_name"}
"#;

        let routes_str = r#"
{
"gpt-4o": [
{"name": "Image generation", "description": "generating image"}
]
}
"#;
        let llm_routes =
            serde_json::from_str::<HashMap<String, Vec<RoutingPreference>>>(routes_str).unwrap();
        let routing_model = "test-model".to_string();
        let router = RouterModelV1::new(llm_routes, routing_model, 235);

        let conversation_str = r#"
[
{
"role": "user",
"content": "hi"
},
{
"role": "assistant",
"content": "Hello! How can I assist you today?"
},
{
"role": "user",
"content": "given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson"
}
]
"#;

        let conversation: Vec<Message> = serde_json::from_str(conversation_str).unwrap();

        let req = router.generate_request(&conversation, &None);

        let prompt = req.messages[0].content.extract_text();

        assert_eq!(expected_prompt, prompt);
    }

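    // Verifies that the latest user message is still included even when that single
    // message alone appears to exceed the token budget (200 here).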
    #[test]
    fn test_conversation_exceed_token_count_large_single_message() {
        let expected_prompt = r#"
You are a helpful assistant designed to find the best suited route.
You are provided with route description within <routes></routes> XML tags:
<routes>
[{"name":"Image generation","description":"generating image"}]
</routes>

<conversation>
[{"role":"user","content":"given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson and this is a very long message that exceeds the max token length of the routing model, so it should be truncated and only the last user message should be included in the conversation for routing."}]
</conversation>

Your task is to decide which route is best suit with user intent on the conversation in <conversation></conversation> XML tags. Follow the instruction:
1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}.
2. You must analyze the route descriptions and find the best match route for user latest intent.
3. You only response the name of the route that best matches the user's request, use the exact name in the <routes></routes>.

Based on your analysis, provide your response in the following JSON formats if you decide to match any route:
{"route": "route_name"}
"#;

        let routes_str = r#"
{
"gpt-4o": [
{"name": "Image generation", "description": "generating image"}
]
}
"#;
        let llm_routes =
            serde_json::from_str::<HashMap<String, Vec<RoutingPreference>>>(routes_str).unwrap();

        let routing_model = "test-model".to_string();
        let router = RouterModelV1::new(llm_routes, routing_model, 200);

        let conversation_str = r#"
[
{
"role": "user",
"content": "hi"
},
{
"role": "assistant",
"content": "Hello! How can I assist you today?"
},
{
"role": "user",
"content": "given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson and this is a very long message that exceeds the max token length of the routing model, so it should be truncated and only the last user message should be included in the conversation for routing."
}
]
"#;

        let conversation: Vec<Message> = serde_json::from_str(conversation_str).unwrap();

        let req = router.generate_request(&conversation, &None);

        let prompt = req.messages[0].content.extract_text();

        assert_eq!(expected_prompt, prompt);
    }

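    // Verifies that trimming stops at a user-message boundary: with a budget of 230,
    // the initial greeting exchange is dropped while the later exchange starting at
    // the "Andy Warhol" user message is kept intact.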
    #[test]
    fn test_conversation_trim_upto_user_message() {
        let expected_prompt = r#"
You are a helpful assistant designed to find the best suited route.
You are provided with route description within <routes></routes> XML tags:
<routes>
[{"name":"Image generation","description":"generating image"}]
</routes>

<conversation>
[{"role":"user","content":"given the image In style of Andy Warhol"},{"role":"assistant","content":"ok here is the image"},{"role":"user","content":"pls give me another image about Bart and Lisa"}]
</conversation>

Your task is to decide which route is best suit with user intent on the conversation in <conversation></conversation> XML tags. Follow the instruction:
1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}.
2. You must analyze the route descriptions and find the best match route for user latest intent.
3. You only response the name of the route that best matches the user's request, use the exact name in the <routes></routes>.

Based on your analysis, provide your response in the following JSON formats if you decide to match any route:
{"route": "route_name"}
"#;

        let routes_str = r#"
{
"gpt-4o": [
{"name": "Image generation", "description": "generating image"}
]
}
"#;
        let llm_routes =
            serde_json::from_str::<HashMap<String, Vec<RoutingPreference>>>(routes_str).unwrap();
        let routing_model = "test-model".to_string();
        let router = RouterModelV1::new(llm_routes, routing_model, 230);

        let conversation_str = r#"
[
{
"role": "user",
"content": "hi"
},
{
"role": "assistant",
"content": "Hello! How can I assist you today?"
},
{
"role": "user",
"content": "given the image In style of Andy Warhol"
},
{
"role": "assistant",
"content": "ok here is the image"
},
{
"role": "user",
"content": "pls give me another image about Bart and Lisa"
}
]
"#;

        let conversation: Vec<Message> = serde_json::from_str(conversation_str).unwrap();

        let req = router.generate_request(&conversation, &None);

        let prompt = req.messages[0].content.extract_text();

        assert_eq!(expected_prompt, prompt);
    }

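    // Verifies that multimodal content parts are flattened for routing: only the text
    // part of the mixed text/image_url message appears in the routing conversation.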
    #[test]
    fn test_non_text_input() {
        let expected_prompt = r#"
You are a helpful assistant designed to find the best suited route.
You are provided with route description within <routes></routes> XML tags:
<routes>
[{"name":"Image generation","description":"generating image"}]
</routes>

<conversation>
[{"role":"user","content":"hi"},{"role":"assistant","content":"Hello! How can I assist you today?"},{"role":"user","content":"given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson"}]
</conversation>

Your task is to decide which route is best suit with user intent on the conversation in <conversation></conversation> XML tags. Follow the instruction:
1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}.
2. You must analyze the route descriptions and find the best match route for user latest intent.
3. You only response the name of the route that best matches the user's request, use the exact name in the <routes></routes>.

Based on your analysis, provide your response in the following JSON formats if you decide to match any route:
{"route": "route_name"}
"#;
        let routes_str = r#"
{
"gpt-4o": [
{"name": "Image generation", "description": "generating image"}
]
}
"#;
        let llm_routes =
            serde_json::from_str::<HashMap<String, Vec<RoutingPreference>>>(routes_str).unwrap();
        let routing_model = "test-model".to_string();
        let router = RouterModelV1::new(llm_routes, routing_model, usize::MAX);

        let conversation_str = r#"
[
{
"role": "user",
"content": [
{
"type": "text",
"text": "hi"
},
{
"type": "image_url",
"image_url": {
"url": "https://example.com/image.png"
}
}
]
},
{
"role": "assistant",
"content": "Hello! How can I assist you today?"
},
{
"role": "user",
"content": "given the image In style of Andy Warhol, portrait of Bart and Lisa Simpson"
}
]
"#;
        let conversation: Vec<Message> = serde_json::from_str(conversation_str).unwrap();

        let req = router.generate_request(&conversation, &None);

        let prompt = req.messages[0].content.extract_text();

        assert_eq!(expected_prompt, prompt);
    }

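    // Verifies that assistant tool_calls and tool result messages are skipped when
    // building the routing conversation; only plain user/assistant turns remain.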
    #[test]
    fn test_skip_tool_call() {
        let expected_prompt = r#"
You are a helpful assistant designed to find the best suited route.
You are provided with route description within <routes></routes> XML tags:
<routes>
[{"name":"Image generation","description":"generating image"}]
</routes>

<conversation>
[{"role":"user","content":"What's the weather like in Tokyo?"},{"role":"assistant","content":"The current weather in Tokyo is 22°C and sunny."},{"role":"user","content":"What about in New York?"}]
</conversation>

Your task is to decide which route is best suit with user intent on the conversation in <conversation></conversation> XML tags. Follow the instruction:
1. If the latest intent from user is irrelevant or user intent is full filled, response with other route {"route": "other"}.
2. You must analyze the route descriptions and find the best match route for user latest intent.
3. You only response the name of the route that best matches the user's request, use the exact name in the <routes></routes>.

Based on your analysis, provide your response in the following JSON formats if you decide to match any route:
{"route": "route_name"}
"#;
        let routes_str = r#"
{
"gpt-4o": [
{"name": "Image generation", "description": "generating image"}
]
}
"#;
        let llm_routes =
            serde_json::from_str::<HashMap<String, Vec<RoutingPreference>>>(routes_str).unwrap();
        let routing_model = "test-model".to_string();
        let router = RouterModelV1::new(llm_routes, routing_model, usize::MAX);

        let conversation_str = r#"
[
{
"role": "user",
"content": "What's the weather like in Tokyo?"
},
{
"role": "assistant",
"content": "",
"tool_calls": [
{
"id": "toolcall-abc123",
"type": "function",
"function": {
"name": "get_weather",
"arguments": "{ \"location\": \"Tokyo\" }"
}
}
]
},
{
"role": "tool",
"tool_call_id": "toolcall-abc123",
"content": "{ \"temperature\": \"22°C\", \"condition\": \"Sunny\" }"
},
{
"role": "assistant",
"content": "The current weather in Tokyo is 22°C and sunny."
},
{
"role": "user",
"content": "What about in New York?"
}
]
"#;

        // expects conversation to look like this

        // [
        //     {
        //         "role": "user",
        //         "content": "What's the weather like in Tokyo?"
        //     },
        //     {
        //         "role": "assistant",
        //         "content": "The current weather in Tokyo is 22°C and sunny."
        //     },
        //     {
        //         "role": "user",
        //         "content": "What about in New York?"
        //     }
        // ]

        let conversation: Vec<Message> = serde_json::from_str(conversation_str).unwrap();

        let req: ChatCompletionsRequest = router.generate_request(&conversation, &None);

        let prompt = req.messages[0].content.extract_text();

        assert_eq!(expected_prompt, prompt);
    }

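    // Exercises parse_response across well-formed, empty, null, missing, malformed,
    // single-quoted, and code-fenced model outputs.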
    #[test]
    fn test_parse_response() {
        let routes_str = r#"
{
"gpt-4o": [
{"name": "Image generation", "description": "generating image"}
]
}
"#;
        let llm_routes =
            serde_json::from_str::<HashMap<String, Vec<RoutingPreference>>>(routes_str).unwrap();

        let router = RouterModelV1::new(llm_routes, "test-model".to_string(), 2000);

        // Case 1: Valid JSON with non-empty route
        let input = r#"{"route": "Image generation"}"#;
        let result = router.parse_response(input, &None).unwrap();
        assert_eq!(
            result,
            Some(("Image generation".to_string(), "gpt-4o".to_string()))
        );

        // Case 2: Valid JSON with empty route
        let input = r#"{"route": ""}"#;
        let result = router.parse_response(input, &None).unwrap();
        assert_eq!(result, None);

        // Case 3: Valid JSON with null route
        let input = r#"{"route": null}"#;
        let result = router.parse_response(input, &None).unwrap();
        assert_eq!(result, None);

        // Case 4: JSON missing route field
        let input = r#"{}"#;
        let result = router.parse_response(input, &None).unwrap();
        assert_eq!(result, None);

        // Case 4.1: empty string
        let input = r#""#;
        let result = router.parse_response(input, &None).unwrap();
        assert_eq!(result, None);

        // Case 5: Malformed JSON
        let input = r#"{"route": "route1""#; // missing closing }
        let result = router.parse_response(input, &None);
        assert!(result.is_err());

        // Case 6: Single quotes and \n in JSON
        let input = "{'route': 'Image generation'}\\n";
        let result = router.parse_response(input, &None).unwrap();
        assert_eq!(
            result,
            Some(("Image generation".to_string(), "gpt-4o".to_string()))
        );

        // Case 7: Code block marker
        let input = "```json\n{\"route\": \"Image generation\"}\n```";
        let result = router.parse_response(input, &None).unwrap();
        assert_eq!(
            result,
            Some(("Image generation".to_string(), "gpt-4o".to_string()))
        );
    }
}