From bc059aed4d80c3a0d6123bf7a506da3dc1d7f416 Mon Sep 17 00:00:00 2001
From: Adil Hafeez <adil.hafeez@gmail.com>
Date: Sun, 15 Mar 2026 09:36:11 -0700
Subject: [PATCH] Unified overrides for custom router and orchestrator models
 (#820)

* support configurable orchestrator model via orchestration config section

* add self-hosting docs and demo for Plano-Orchestrator

* list all Plano-Orchestrator model variants in docs

* use overrides for custom routing and orchestration model

* update docs

* update orchestrator model name

* rename arch provider to plano, use llm_routing_model and agent_orchestration_model

* regenerate rendered config reference
---
 cli/planoai/config_generator.py               | 62 +++++++-------
 config/envoy.template.yaml                    |  4 +-
 config/plano_config_schema.yaml               | 18 ++--
 .../src/handlers/agent_selector.rs            |  1 +
 .../src/handlers/integration_tests.rs         |  1 +
 crates/brightstaff/src/main.rs                | 45 +++++++---
 .../src/router/plano_orchestrator.rs          | 18 ++--
 crates/common/src/configuration.rs            | 19 ++---
 crates/common/src/consts.rs                   |  3 +-
 crates/hermesllm/src/lib.rs                   |  2 +-
 crates/hermesllm/src/providers/id.rs          | 10 +--
 crates/llm_gateway/src/stream_context.rs      |  2 +-
 .../travel_agents/README.md                   | 36 ++++++++
 .../config_local_orchestrator.yaml            | 66 +++++++++++++++
 .../travel_agents/run_demo.sh                 |  9 +-
 .../llm_routing/openclaw_routing/config.yaml  |  5 +-
 .../plano_config_local.yaml                   |  8 +-
 docs/source/guides/llm_router.rst             | 16 ++--
 docs/source/guides/orchestration.rst          | 84 +++++++++++++++++++
 .../plano_config_full_reference_rendered.yaml |  6 +-
 20 files changed, 312 insertions(+), 103 deletions(-)
 create mode 100644 demos/agent_orchestration/travel_agents/config_local_orchestrator.yaml

diff --git a/cli/planoai/config_generator.py b/cli/planoai/config_generator.py
index 522968c9..c4e5831b 100644
--- a/cli/planoai/config_generator.py
+++ b/cli/planoai/config_generator.py
@@ -8,13 +8,12 @@ from urllib.parse import urlparse
 from copy import deepcopy
 from planoai.consts import DEFAULT_OTEL_TRACING_GRPC_ENDPOINT
 
-
 SUPPORTED_PROVIDERS_WITH_BASE_URL = [
     "azure_openai",
     "ollama",
     "qwen",
     "amazon_bedrock",
-    "arch",
+    "plano",
 ]
 
 SUPPORTED_PROVIDERS_WITHOUT_BASE_URL = [
@@ -368,47 +367,52 @@ def validate_and_render_schema():
                     llms_with_endpoint.append(model_provider)
                     llms_with_endpoint_cluster_names.add(cluster_name)
 
-    if len(model_usage_name_keys) > 0:
-        routing_model_provider = config_yaml.get("routing", {}).get(
-            "model_provider", None
+    overrides_config = config_yaml.get("overrides", {})
+    # Build lookup of model names (already prefix-stripped by config processing)
+    model_name_set = {mp.get("model") for mp in updated_model_providers}
+
+    # Auto-add arch-router provider if routing preferences exist and no provider matches the router model
+    router_model = overrides_config.get("llm_routing_model", "Arch-Router")
+    # Strip provider prefix for comparison since config processing strips prefixes from model names
+    router_model_id = (
+        router_model.split("/", 1)[1] if "/" in router_model else router_model
+    )
+    if len(model_usage_name_keys) > 0 and router_model_id not in model_name_set:
+        updated_model_providers.append(
+            {
+                "name": "arch-router",
+                "provider_interface": "plano",
+                "model": router_model_id,
+                "internal": True,
+            }
         )
-        if (
-            routing_model_provider
-            and routing_model_provider not in model_provider_name_set
-        ):
-            raise Exception(
-                f"Routing model_provider {routing_model_provider} is not defined in model_providers"
-            )
-        if (
-            routing_model_provider is None
-            and "arch-router" not in model_provider_name_set
-        ):
-            updated_model_providers.append(
-                {
-                    "name": "arch-router",
-                    "provider_interface": "arch",
-                    "model": config_yaml.get("routing", {}).get("model", "Arch-Router"),
-                    "internal": True,
-                }
-            )
 
     # Always add arch-function model provider if not already defined
     if "arch-function" not in model_provider_name_set:
         updated_model_providers.append(
             {
                 "name": "arch-function",
-                "provider_interface": "arch",
+                "provider_interface": "plano",
                 "model": "Arch-Function",
                 "internal": True,
             }
         )
 
-    if "plano-orchestrator" not in model_provider_name_set:
+    # Auto-add plano-orchestrator provider if no provider matches the orchestrator model
+    orchestrator_model = overrides_config.get(
+        "agent_orchestration_model", "Plano-Orchestrator"
+    )
+    orchestrator_model_id = (
+        orchestrator_model.split("/", 1)[1]
+        if "/" in orchestrator_model
+        else orchestrator_model
+    )
+    if orchestrator_model_id not in model_name_set:
         updated_model_providers.append(
             {
-                "name": "plano-orchestrator",
-                "provider_interface": "arch",
-                "model": "Plano-Orchestrator",
+                "name": "plano/orchestrator",
+                "provider_interface": "plano",
+                "model": orchestrator_model_id,
                 "internal": True,
             }
         )
diff --git a/config/envoy.template.yaml b/config/envoy.template.yaml
index a780c3f1..c2dd5ed0 100644
--- a/config/envoy.template.yaml
+++ b/config/envoy.template.yaml
@@ -594,13 +594,13 @@ static_resources:
 
   clusters:
 
-    - name: arch
+    - name: plano
       connect_timeout: {{ upstream_connect_timeout | default('5s') }}
       type: LOGICAL_DNS
       dns_lookup_family: V4_ONLY
       lb_policy: ROUND_ROBIN
       load_assignment:
-        cluster_name: arch
+        cluster_name: plano
         endpoints:
           - lb_endpoints:
               - endpoint:
diff --git a/config/plano_config_schema.yaml b/config/plano_config_schema.yaml
index b63cb824..e204e298 100644
--- a/config/plano_config_schema.yaml
+++ b/config/plano_config_schema.yaml
@@ -173,7 +173,7 @@ properties:
         provider_interface:
           type: string
           enum:
-            - arch
+            - plano
             - claude
             - deepseek
             - groq
@@ -220,7 +220,7 @@ properties:
         provider_interface:
           type: string
           enum:
-            - arch
+            - plano
             - claude
             - deepseek
             - groq
@@ -271,6 +271,12 @@ properties:
       upstream_tls_ca_path:
         type: string
         description: "Path to the trusted CA bundle for upstream TLS verification. Default is '/etc/ssl/certs/ca-certificates.crt'."
+      llm_routing_model:
+        type: string
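+        description: "Model name for the LLM router (e.g., 'Arch-Router'); a leading '<provider>/' prefix is stripped before matching. If no entry in model_providers has this model and routing preferences are defined, an internal 'plano' provider is added automatically."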
+      agent_orchestration_model:
+        type: string
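+        description: "Model name for the agent orchestrator (e.g., 'Plano-Orchestrator'); a leading '<provider>/' prefix is stripped before matching. If no entry in model_providers has this model, an internal 'plano' provider is added automatically."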
   system_prompt:
     type: string
   prompt_targets:
@@ -408,14 +414,6 @@ properties:
     enum:
       - llm
       - prompt
-  routing:
-    type: object
-    properties:
-      llm_provider:
-        type: string
-      model:
-        type: string
-    additionalProperties: false
   state_storage:
     type: object
     properties:
diff --git a/crates/brightstaff/src/handlers/agent_selector.rs b/crates/brightstaff/src/handlers/agent_selector.rs
index faa734ee..a1b38b2c 100644
--- a/crates/brightstaff/src/handlers/agent_selector.rs
+++ b/crates/brightstaff/src/handlers/agent_selector.rs
@@ -178,6 +178,7 @@ mod tests {
         Arc::new(OrchestratorService::new(
             "http://localhost:8080".to_string(),
             "test-model".to_string(),
+            "plano-orchestrator".to_string(),
         ))
     }
 
diff --git a/crates/brightstaff/src/handlers/integration_tests.rs b/crates/brightstaff/src/handlers/integration_tests.rs
index 70b2999d..b440e198 100644
--- a/crates/brightstaff/src/handlers/integration_tests.rs
+++ b/crates/brightstaff/src/handlers/integration_tests.rs
@@ -23,6 +23,7 @@ mod tests {
         Arc::new(OrchestratorService::new(
             "http://localhost:8080".to_string(),
             "test-model".to_string(),
+            "plano-orchestrator".to_string(),
         ))
     }
 
diff --git a/crates/brightstaff/src/main.rs b/crates/brightstaff/src/main.rs
index 51c9127f..b9faafbf 100644
--- a/crates/brightstaff/src/main.rs
+++ b/crates/brightstaff/src/main.rs
@@ -11,9 +11,7 @@ use brightstaff::state::StateStorage;
 use brightstaff::utils::tracing::init_tracer;
 use bytes::Bytes;
 use common::configuration::{Agent, Configuration};
-use common::consts::{
-    CHAT_COMPLETIONS_PATH, MESSAGES_PATH, OPENAI_RESPONSES_API_PATH, PLANO_ORCHESTRATOR_MODEL_NAME,
-};
+use common::consts::{CHAT_COMPLETIONS_PATH, MESSAGES_PATH, OPENAI_RESPONSES_API_PATH};
 use common::llm_providers::LlmProviders;
 use http_body_util::{combinators::BoxBody, BodyExt, Empty};
 use hyper::body::Incoming;
@@ -35,6 +33,8 @@ pub mod router;
 const BIND_ADDRESS: &str = "0.0.0.0:9091";
 const DEFAULT_ROUTING_LLM_PROVIDER: &str = "arch-router";
 const DEFAULT_ROUTING_MODEL_NAME: &str = "Arch-Router";
+const DEFAULT_ORCHESTRATOR_LLM_PROVIDER: &str = "plano-orchestrator";
+const DEFAULT_ORCHESTRATOR_MODEL_NAME: &str = "Plano-Orchestrator";
 
 // Utility function to extract the context from the incoming request headers
 fn extract_context_from_request(req: &Request<Incoming>) -> Context {
@@ -90,16 +90,21 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
         env::var("LLM_PROVIDER_ENDPOINT").unwrap_or_else(|_| "http://localhost:12001".to_string());
 
     let listener = TcpListener::bind(bind_address).await?;
-    let routing_model_name: String = plano_config
-        .routing
-        .as_ref()
-        .and_then(|r| r.model.clone())
-        .unwrap_or_else(|| DEFAULT_ROUTING_MODEL_NAME.to_string());
+    let overrides = plano_config.overrides.clone().unwrap_or_default();
+
+    // Strip provider prefix (e.g. "plano/") to get the model ID used in upstream requests
+    let routing_model_name: String = overrides
+        .llm_routing_model
+        .as_deref()
+        .map(|m| m.split_once('/').map(|(_, id)| id).unwrap_or(m))
+        .unwrap_or(DEFAULT_ROUTING_MODEL_NAME)
+        .to_string();
 
     let routing_llm_provider = plano_config
-        .routing
-        .as_ref()
-        .and_then(|r| r.model_provider.clone())
+        .model_providers
+        .iter()
+        .find(|p| p.model.as_deref() == Some(routing_model_name.as_str()))
+        .map(|p| p.name.clone())
         .unwrap_or_else(|| DEFAULT_ROUTING_LLM_PROVIDER.to_string());
 
     let router_service: Arc<RouterService> = Arc::new(RouterService::new(
@@ -109,9 +114,25 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
         routing_llm_provider,
     ));
 
+    // Strip provider prefix (e.g. "plano/") to get the model ID used in upstream requests
+    let orchestrator_model_name: String = overrides
+        .agent_orchestration_model
+        .as_deref()
+        .map(|m| m.split_once('/').map(|(_, id)| id).unwrap_or(m))
+        .unwrap_or(DEFAULT_ORCHESTRATOR_MODEL_NAME)
+        .to_string();
+
+    let orchestrator_llm_provider: String = plano_config
+        .model_providers
+        .iter()
+        .find(|p| p.model.as_deref() == Some(orchestrator_model_name.as_str()))
+        .map(|p| p.name.clone())
+        .unwrap_or_else(|| DEFAULT_ORCHESTRATOR_LLM_PROVIDER.to_string());
+
     let orchestrator_service: Arc<OrchestratorService> = Arc::new(OrchestratorService::new(
         format!("{llm_provider_url}{CHAT_COMPLETIONS_PATH}"),
-        PLANO_ORCHESTRATOR_MODEL_NAME.to_string(),
+        orchestrator_model_name,
+        orchestrator_llm_provider,
     ));
 
     let model_aliases = Arc::new(plano_config.model_aliases.clone());
diff --git a/crates/brightstaff/src/router/plano_orchestrator.rs b/crates/brightstaff/src/router/plano_orchestrator.rs
index cf2688b9..12140570 100644
--- a/crates/brightstaff/src/router/plano_orchestrator.rs
+++ b/crates/brightstaff/src/router/plano_orchestrator.rs
@@ -2,7 +2,7 @@ use std::{collections::HashMap, sync::Arc};
 
 use common::{
     configuration::{AgentUsagePreference, OrchestrationPreference},
-    consts::{ARCH_PROVIDER_HINT_HEADER, PLANO_ORCHESTRATOR_MODEL_NAME, REQUEST_ID_HEADER},
+    consts::{ARCH_PROVIDER_HINT_HEADER, REQUEST_ID_HEADER},
 };
 use hermesllm::apis::openai::{ChatCompletionsResponse, Message};
 use hyper::header;
@@ -19,6 +19,7 @@ pub struct OrchestratorService {
     orchestrator_url: String,
     client: reqwest::Client,
     orchestrator_model: Arc<dyn OrchestratorModel>,
+    orchestrator_provider_name: String,
 }
 
 #[derive(Debug, Error)]
@@ -36,7 +37,11 @@ pub enum OrchestrationError {
 pub type Result<T> = std::result::Result<T, OrchestrationError>;
 
 impl OrchestratorService {
-    pub fn new(orchestrator_url: String, orchestration_model_name: String) -> Self {
+    pub fn new(
+        orchestrator_url: String,
+        orchestration_model_name: String,
+        orchestrator_provider_name: String,
+    ) -> Self {
         // Empty agent orchestrations - will be provided via usage_preferences in requests
         let agent_orchestrations: HashMap<String, Vec<OrchestrationPreference>> = HashMap::new();
 
@@ -50,6 +55,7 @@ impl OrchestratorService {
             orchestrator_url,
             client: reqwest::Client::new(),
             orchestrator_model,
+            orchestrator_provider_name,
         }
     }
 
@@ -75,12 +81,12 @@ impl OrchestratorService {
         debug!(
             model = %self.orchestrator_model.get_model_name(),
             endpoint = %self.orchestrator_url,
-            "sending request to arch-orchestrator"
+            "sending request to plano-orchestrator"
         );
 
         debug!(
             body = %serde_json::to_string(&orchestrator_request).unwrap(),
-            "arch orchestrator request"
+            "plano orchestrator request"
         );
 
         let mut orchestration_request_headers = header::HeaderMap::new();
@@ -91,7 +97,7 @@ impl OrchestratorService {
 
         orchestration_request_headers.insert(
             header::HeaderName::from_static(ARCH_PROVIDER_HINT_HEADER),
-            header::HeaderValue::from_str(PLANO_ORCHESTRATOR_MODEL_NAME).unwrap(),
+            header::HeaderValue::from_str(&self.orchestrator_provider_name).unwrap(),
         );
 
         // Inject OpenTelemetry trace context from current span
@@ -110,7 +116,7 @@ impl OrchestratorService {
 
         orchestration_request_headers.insert(
             header::HeaderName::from_static("model"),
-            header::HeaderValue::from_static(PLANO_ORCHESTRATOR_MODEL_NAME),
+            header::HeaderValue::from_str(&self.orchestrator_provider_name).unwrap(),
         );
 
         let start_time = std::time::Instant::now();
diff --git a/crates/common/src/configuration.rs b/crates/common/src/configuration.rs
index f4e2b7b4..12c7d73f 100644
--- a/crates/common/src/configuration.rs
+++ b/crates/common/src/configuration.rs
@@ -7,12 +7,6 @@ use crate::api::open_ai::{
     ChatCompletionTool, FunctionDefinition, FunctionParameter, FunctionParameters, ParameterType,
 };
 
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct Routing {
-    pub model_provider: Option<String>,
-    pub model: Option<String>,
-}
-
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct ModelAlias {
     pub target: String,
@@ -72,7 +66,6 @@ pub struct Configuration {
     pub ratelimits: Option<Vec<Ratelimit>>,
     pub tracing: Option<Tracing>,
     pub mode: Option<GatewayMode>,
-    pub routing: Option<Routing>,
     pub agents: Option<Vec<Agent>>,
     pub filters: Option<Vec<Agent>>,
     pub listeners: Vec<Listener>,
@@ -84,6 +77,8 @@ pub struct Overrides {
     pub prompt_target_intent_matching_threshold: Option<f64>,
     pub optimize_context_window: Option<bool>,
     pub use_agent_orchestrator: Option<bool>,
+    pub llm_routing_model: Option<String>,
+    pub agent_orchestration_model: Option<String>,
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize, Default)]
@@ -207,8 +202,6 @@ pub struct EmbeddingProviver {
 
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
 pub enum LlmProviderType {
-    #[serde(rename = "arch")]
-    Arch,
     #[serde(rename = "anthropic")]
     Anthropic,
     #[serde(rename = "deepseek")]
@@ -237,12 +230,13 @@ pub enum LlmProviderType {
     Qwen,
     #[serde(rename = "amazon_bedrock")]
     AmazonBedrock,
+    #[serde(rename = "plano")]
+    Plano,
 }
 
 impl Display for LlmProviderType {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
-            LlmProviderType::Arch => write!(f, "arch"),
             LlmProviderType::Anthropic => write!(f, "anthropic"),
             LlmProviderType::Deepseek => write!(f, "deepseek"),
             LlmProviderType::Groq => write!(f, "groq"),
@@ -257,6 +251,7 @@ impl Display for LlmProviderType {
             LlmProviderType::Zhipu => write!(f, "zhipu"),
             LlmProviderType::Qwen => write!(f, "qwen"),
             LlmProviderType::AmazonBedrock => write!(f, "amazon_bedrock"),
+            LlmProviderType::Plano => write!(f, "plano"),
         }
     }
 }
@@ -591,14 +586,14 @@ mod test {
             },
             LlmProvider {
                 name: "arch-router".to_string(),
-                provider_interface: LlmProviderType::Arch,
+                provider_interface: LlmProviderType::Plano,
                 model: Some("Arch-Router".to_string()),
                 internal: Some(true),
                 ..Default::default()
             },
             LlmProvider {
                 name: "plano-orchestrator".to_string(),
-                provider_interface: LlmProviderType::Arch,
+                provider_interface: LlmProviderType::Plano,
                 model: Some("Plano-Orchestrator".to_string()),
                 internal: Some(true),
                 ..Default::default()
diff --git a/crates/common/src/consts.rs b/crates/common/src/consts.rs
index cafc8e80..dbd0bc41 100644
--- a/crates/common/src/consts.rs
+++ b/crates/common/src/consts.rs
@@ -33,5 +33,4 @@ pub const OTEL_COLLECTOR_HTTP: &str = "opentelemetry_collector_http";
 pub const LLM_ROUTE_HEADER: &str = "x-arch-llm-route";
 pub const ENVOY_RETRY_HEADER: &str = "x-envoy-max-retries";
 pub const BRIGHT_STAFF_SERVICE_NAME: &str = "brightstaff";
-pub const PLANO_ORCHESTRATOR_MODEL_NAME: &str = "Plano-Orchestrator";
-pub const ARCH_FC_CLUSTER: &str = "arch";
+pub const PLANO_FC_CLUSTER: &str = "plano";
diff --git a/crates/hermesllm/src/lib.rs b/crates/hermesllm/src/lib.rs
index 997fc72a..3b9611e0 100644
--- a/crates/hermesllm/src/lib.rs
+++ b/crates/hermesllm/src/lib.rs
@@ -35,7 +35,7 @@ mod tests {
             ProviderId::Mistral
         );
         assert_eq!(ProviderId::try_from("groq").unwrap(), ProviderId::Groq);
-        assert_eq!(ProviderId::try_from("arch").unwrap(), ProviderId::Arch);
+        assert_eq!(ProviderId::try_from("plano").unwrap(), ProviderId::Plano);
 
         // Test aliases
         assert_eq!(ProviderId::try_from("google").unwrap(), ProviderId::Gemini);
diff --git a/crates/hermesllm/src/providers/id.rs b/crates/hermesllm/src/providers/id.rs
index 11008711..9f5f42c9 100644
--- a/crates/hermesllm/src/providers/id.rs
+++ b/crates/hermesllm/src/providers/id.rs
@@ -34,7 +34,7 @@ pub enum ProviderId {
     Gemini,
     Anthropic,
     GitHub,
-    Arch,
+    Plano,
     AzureOpenAI,
     XAI,
     TogetherAI,
@@ -58,7 +58,7 @@ impl TryFrom<&str> for ProviderId {
             "google" => Ok(ProviderId::Gemini), // alias
             "anthropic" => Ok(ProviderId::Anthropic),
             "github" => Ok(ProviderId::GitHub),
-            "arch" => Ok(ProviderId::Arch),
+            "plano" => Ok(ProviderId::Plano),
             "azure_openai" => Ok(ProviderId::AzureOpenAI),
             "xai" => Ok(ProviderId::XAI),
             "together_ai" => Ok(ProviderId::TogetherAI),
@@ -135,7 +135,7 @@ impl ProviderId {
                 | ProviderId::Groq
                 | ProviderId::Mistral
                 | ProviderId::Deepseek
-                | ProviderId::Arch
+                | ProviderId::Plano
                 | ProviderId::Gemini
                 | ProviderId::GitHub
                 | ProviderId::AzureOpenAI
@@ -153,7 +153,7 @@ impl ProviderId {
                 | ProviderId::Groq
                 | ProviderId::Mistral
                 | ProviderId::Deepseek
-                | ProviderId::Arch
+                | ProviderId::Plano
                 | ProviderId::Gemini
                 | ProviderId::GitHub
                 | ProviderId::AzureOpenAI
@@ -219,7 +219,7 @@ impl Display for ProviderId {
             ProviderId::Gemini => write!(f, "Gemini"),
             ProviderId::Anthropic => write!(f, "Anthropic"),
             ProviderId::GitHub => write!(f, "GitHub"),
-            ProviderId::Arch => write!(f, "Arch"),
+            ProviderId::Plano => write!(f, "Plano"),
             ProviderId::AzureOpenAI => write!(f, "azure_openai"),
             ProviderId::XAI => write!(f, "xai"),
             ProviderId::TogetherAI => write!(f, "together_ai"),
diff --git a/crates/llm_gateway/src/stream_context.rs b/crates/llm_gateway/src/stream_context.rs
index 7a353bcb..f62631fa 100644
--- a/crates/llm_gateway/src/stream_context.rs
+++ b/crates/llm_gateway/src/stream_context.rs
@@ -873,7 +873,7 @@ impl HttpContext for StreamContext {
                 // ensure that the provider has an endpoint if the access key is missing else return a bad request
                 if self.llm_provider.as_ref().unwrap().endpoint.is_none()
                     && self.llm_provider.as_ref().unwrap().provider_interface
-                        != LlmProviderType::Arch
+                        != LlmProviderType::Plano
                 {
                     self.send_server_error(error, Some(StatusCode::BAD_REQUEST));
                 }
diff --git a/demos/agent_orchestration/travel_agents/README.md b/demos/agent_orchestration/travel_agents/README.md
index 7886539d..239ba938 100644
--- a/demos/agent_orchestration/travel_agents/README.md
+++ b/demos/agent_orchestration/travel_agents/README.md
@@ -123,6 +123,42 @@ Each agent:
 
 Both agents run as native local processes and communicate with Plano running natively on the host.
 
+## Running with local Plano-Orchestrator (via vLLM)
+
+By default, Plano uses a hosted Plano-Orchestrator endpoint. To self-host the orchestrator model locally using vLLM on a server with an NVIDIA GPU:
+
+1. Install vLLM (the model weights are downloaded automatically the first time `vllm serve` runs in the next step):
+```bash
+pip install vllm
+```
+
+2. Start the vLLM server with the 4B model:
+```bash
+vllm serve katanemo/Plano-Orchestrator-4B \
+    --host 0.0.0.0 \
+    --port 8000 \
+    --tensor-parallel-size 1 \
+    --gpu-memory-utilization 0.3 \
+    --tokenizer katanemo/Plano-Orchestrator-4B \
+    --chat-template chat_template.jinja \
+    --served-model-name katanemo/Plano-Orchestrator-4B \
+    --enable-prefix-caching
+```
+
+3. Start the demo with the local orchestrator config:
+```bash
+./run_demo.sh --local-orchestrator
+```
+
+4. Test with curl:
+```bash
+curl -X POST http://localhost:8001/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{"model": "gpt-5.2", "messages": [{"role": "user", "content": "What is the weather in Istanbul?"}]}'
+```
+
+You should see Plano use your local orchestrator to route the request to the weather agent.
+
 ## Observability
 
 This demo includes full OpenTelemetry (OTel) compatible distributed tracing to monitor and debug agent interactions:
diff --git a/demos/agent_orchestration/travel_agents/config_local_orchestrator.yaml b/demos/agent_orchestration/travel_agents/config_local_orchestrator.yaml
new file mode 100644
index 00000000..1d3a0be8
--- /dev/null
+++ b/demos/agent_orchestration/travel_agents/config_local_orchestrator.yaml
@@ -0,0 +1,66 @@
+version: v0.3.0
+
+overrides:
+  agent_orchestration_model: plano/katanemo/Plano-Orchestrator-4B
+
+agents:
+  - id: weather_agent
+    url: http://localhost:10510
+  - id: flight_agent
+    url: http://localhost:10520
+
+model_providers:
+  - model: plano/katanemo/Plano-Orchestrator-4B
+    base_url: http://localhost:8000
+
+  - model: openai/gpt-5.2
+    access_key: $OPENAI_API_KEY
+    default: true
+  - model: openai/gpt-4o-mini
+    access_key: $OPENAI_API_KEY # smaller, faster, cheaper model for extracting entities like location
+
+listeners:
+  - type: agent
+    name: travel_booking_service
+    port: 8001
+    router: plano_orchestrator_v1
+    agents:
+      - id: weather_agent
+        description: |
+
+          WeatherAgent is a specialized AI assistant for real-time weather information and forecasts. It provides accurate weather data for any city worldwide using the Open-Meteo API, helping travelers plan their trips with up-to-date weather conditions.
+
+          Capabilities:
+            * Get real-time weather conditions and multi-day forecasts for any city worldwide using Open-Meteo API (free, no API key needed)
+            * Provides current temperature
+            * Provides multi-day forecasts
+            * Provides weather conditions
+            * Provides sunrise/sunset times
+            * Provides detailed weather information
+            * Understands conversation context to resolve location references from previous messages
+            * Handles weather-related questions including "What's the weather in [city]?", "What's the forecast for [city]?", "How's the weather in [city]?"
+            * When queries include both weather and other travel questions (e.g., flights, currency), this agent answers ONLY the weather part
+
+      - id: flight_agent
+        description: |
+
+          FlightAgent is an AI-powered tool specialized in providing live flight information between airports. It leverages the FlightAware AeroAPI to deliver real-time flight status, gate information, and delay updates.
+
+          Capabilities:
+            * Get live flight information between airports using FlightAware AeroAPI
+            * Shows real-time flight status
+            * Shows scheduled/estimated/actual departure and arrival times
+            * Shows gate and terminal information
+            * Shows delays
+            * Shows aircraft type
+            * Shows flight status
+            * Automatically resolves city names to airport codes (IATA/ICAO)
+            * Understands conversation context to infer origin/destination from follow-up questions
+            * Handles flight-related questions including "What flights go from [city] to [city]?", "Do flights go to [city]?", "Are there direct flights from [city]?"
+            * When queries include both flight and other travel questions (e.g., weather, currency), this agent answers ONLY the flight part
+
+tracing:
+  random_sampling: 100
+  span_attributes:
+    header_prefixes:
+      - x-acme-
diff --git a/demos/agent_orchestration/travel_agents/run_demo.sh b/demos/agent_orchestration/travel_agents/run_demo.sh
index 643a0aa2..35166b85 100755
--- a/demos/agent_orchestration/travel_agents/run_demo.sh
+++ b/demos/agent_orchestration/travel_agents/run_demo.sh
@@ -31,8 +31,13 @@ start_demo() {
   fi
 
   # Step 4: Start Plano
-  echo "Starting Plano with config.yaml..."
-  planoai up config.yaml
+  PLANO_CONFIG="config.yaml"
+  if [ "$1" == "--local-orchestrator" ]; then
+    PLANO_CONFIG="config_local_orchestrator.yaml"
+    echo "Using local orchestrator config..."
+  fi
+  echo "Starting Plano with $PLANO_CONFIG..."
+  planoai up "$PLANO_CONFIG"
 
   # Step 5: Start agents natively
   echo "Starting agents..."
diff --git a/demos/llm_routing/openclaw_routing/config.yaml b/demos/llm_routing/openclaw_routing/config.yaml
index 3106b5dd..9690e747 100644
--- a/demos/llm_routing/openclaw_routing/config.yaml
+++ b/demos/llm_routing/openclaw_routing/config.yaml
@@ -1,8 +1,7 @@
 version: v0.1.0
 
-routing:
-  model: Arch-Router
-  llm_provider: arch-router
+overrides:
+  llm_routing_model: Arch-Router
 
 listeners:
   egress_traffic:
diff --git a/demos/llm_routing/preference_based_routing/plano_config_local.yaml b/demos/llm_routing/preference_based_routing/plano_config_local.yaml
index dbd287dd..01adb097 100644
--- a/demos/llm_routing/preference_based_routing/plano_config_local.yaml
+++ b/demos/llm_routing/preference_based_routing/plano_config_local.yaml
@@ -1,8 +1,7 @@
 version: v0.3.0
 
-routing:
-  model: Arch-Router
-  llm_provider: arch-router
+overrides:
+  llm_routing_model: plano/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M
 
 listeners:
   - type: model
@@ -11,8 +10,7 @@ listeners:
 
 model_providers:
 
-  - name: arch-router
-    model: arch/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M
+  - model: plano/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M
     base_url: http://localhost:11434
 
   - model: openai/gpt-4o-mini
diff --git a/docs/source/guides/llm_router.rst b/docs/source/guides/llm_router.rst
index 41c51b4a..0073a664 100644
--- a/docs/source/guides/llm_router.rst
+++ b/docs/source/guides/llm_router.rst
@@ -253,13 +253,11 @@ Using Ollama (recommended for local development)
 
    .. code-block:: yaml
 
-       routing:
-         model: Arch-Router
-         llm_provider: arch-router
+       overrides:
+         llm_routing_model: plano/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M
 
        model_providers:
-         - name: arch-router
-           model: arch/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M
+         - model: plano/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M
            base_url: http://localhost:11434
 
          - model: openai/gpt-5.2
@@ -324,13 +322,11 @@ vLLM provides higher throughput and GPU optimizations suitable for production de
 
    .. code-block:: yaml
 
-       routing:
-         model: Arch-Router
-         llm_provider: arch-router
+       overrides:
+         llm_routing_model: plano/Arch-Router
 
        model_providers:
-         - name: arch-router
-           model: Arch-Router
+         - model: plano/Arch-Router
            base_url: http://<your-server-ip>:10000
 
          - model: openai/gpt-5.2
diff --git a/docs/source/guides/orchestration.rst b/docs/source/guides/orchestration.rst
index 3170b65f..1a153e83 100644
--- a/docs/source/guides/orchestration.rst
+++ b/docs/source/guides/orchestration.rst
@@ -335,6 +335,90 @@ Combine RAG agents for documentation lookup with specialized troubleshooting age
       - id: troubleshoot_agent
         description: Diagnoses and resolves technical issues step by step
 
+Self-hosting Plano-Orchestrator
+-------------------------------
+
+By default, Plano uses a hosted Plano-Orchestrator endpoint. To self-host the orchestrator model, you can serve it using **vLLM** on a server with an NVIDIA GPU.
+
+.. note::
+   vLLM requires a Linux server with an NVIDIA GPU (CUDA). For local development on macOS, a GGUF version for Ollama is coming soon.
+
+The following model variants are available on HuggingFace:
+
+* `Plano-Orchestrator-4B <https://huggingface.co/katanemo/Plano-Orchestrator-4B>`_ — lighter model, suitable for development and testing
+* `Plano-Orchestrator-4B-FP8 <https://huggingface.co/katanemo/Plano-Orchestrator-4B-FP8>`_ — FP8 quantized 4B model, lower memory usage
+* `Plano-Orchestrator-30B-A3B <https://huggingface.co/katanemo/Plano-Orchestrator-30B-A3B>`_ — full-size model for production
+* `Plano-Orchestrator-30B-A3B-FP8 <https://huggingface.co/katanemo/Plano-Orchestrator-30B-A3B-FP8>`_ — FP8 quantized 30B model, recommended for production deployments
+
+Using vLLM
+~~~~~~~~~~
+
+1. **Install vLLM**
+
+   .. code-block:: bash
+
+       pip install vllm
+
+2. **Download the model and chat template**
+
+   .. code-block:: bash
+
+       pip install huggingface_hub
+       huggingface-cli download katanemo/Plano-Orchestrator-4B
+
+3. **Start the vLLM server**
+
+   For the 4B model (development):
+
+   .. code-block:: bash
+
+       vllm serve katanemo/Plano-Orchestrator-4B \
+           --host 0.0.0.0 \
+           --port 8000 \
+           --tensor-parallel-size 1 \
+           --gpu-memory-utilization 0.3 \
+           --tokenizer katanemo/Plano-Orchestrator-4B \
+           --chat-template chat_template.jinja \
+           --served-model-name katanemo/Plano-Orchestrator-4B \
+           --enable-prefix-caching
+
+   For the 30B-A3B-FP8 model (production):
+
+   .. code-block:: bash
+
+       vllm serve katanemo/Plano-Orchestrator-30B-A3B-FP8 \
+           --host 0.0.0.0 \
+           --port 8000 \
+           --tensor-parallel-size 1 \
+           --gpu-memory-utilization 0.9 \
+           --tokenizer katanemo/Plano-Orchestrator-30B-A3B-FP8 \
+           --chat-template chat_template.jinja \
+           --max-model-len 32768 \
+           --served-model-name katanemo/Plano-Orchestrator-30B-A3B-FP8 \
+           --enable-prefix-caching
+
+4. **Configure Plano to use the local orchestrator**
+
+   Use the model name matching your ``--served-model-name``:
+
+   .. code-block:: yaml
+
+       overrides:
+         agent_orchestration_model: plano/katanemo/Plano-Orchestrator-4B
+
+       model_providers:
+         - model: katanemo/Plano-Orchestrator-4B
+           provider_interface: plano
+           base_url: http://<your-server-ip>:8000
+
+5. **Verify the server is running**
+
+   .. code-block:: bash
+
+       curl http://localhost:8000/health
+       curl http://localhost:8000/v1/models
+
+
 Next Steps
 ----------
 
diff --git a/docs/source/resources/includes/plano_config_full_reference_rendered.yaml b/docs/source/resources/includes/plano_config_full_reference_rendered.yaml
index 9717b53a..64ee1f91 100644
--- a/docs/source/resources/includes/plano_config_full_reference_rendered.yaml
+++ b/docs/source/resources/includes/plano_config_full_reference_rendered.yaml
@@ -107,11 +107,11 @@ model_providers:
 - internal: true
   model: Arch-Function
   name: arch-function
-  provider_interface: arch
+  provider_interface: plano
 - internal: true
   model: Plano-Orchestrator
-  name: plano-orchestrator
-  provider_interface: arch
+  name: plano/orchestrator
+  provider_interface: plano
 prompt_targets:
 - description: Get current weather at a location.
   endpoint: