resolve merge conflict in main.rs

2026-05-21 13:55:15 +02:00 · 2026-03-16 12:40:33 -07:00 · 2026-03-16 12:40:33 -07:00 · 80dfb41cad
commit 80dfb41cad
parent 6fe7613bcd 5388c6777f
40 changed files with 920 additions and 301 deletions
--- a/.claude/skills/release/SKILL.md
+++ b/.claude/skills/release/SKILL.md
@ -25,4 +25,6 @@ Update the version string in ALL of these files:
 Do NOT change version strings in `*.lock` files or `Cargo.lock`.
 After updating all version strings, run `cd cli && uv lock` to update the lock file with the new version.
 After making changes, show a summary of all files modified and the old → new version.
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -133,13 +133,13 @@ jobs:
          load: true
          tags: |
            ${{ env.PLANO_DOCKER_IMAGE }}
-            ${{ env.DOCKER_IMAGE }}:0.4.11
+            ${{ env.DOCKER_IMAGE }}:0.4.12
            ${{ env.DOCKER_IMAGE }}:latest
          cache-from: type=gha
          cache-to: type=gha,mode=max
      - name: Save image as artifact
-        run: docker save ${{ env.PLANO_DOCKER_IMAGE }} ${{ env.DOCKER_IMAGE }}:0.4.11 ${{ env.DOCKER_IMAGE }}:latest -o /tmp/plano-image.tar
+        run: docker save ${{ env.PLANO_DOCKER_IMAGE }} ${{ env.DOCKER_IMAGE }}:0.4.12 ${{ env.DOCKER_IMAGE }}:latest -o /tmp/plano-image.tar
      - name: Upload image artifact
        uses: actions/upload-artifact@v6
--- a/.gitignore
+++ b/.gitignore
@ -152,3 +152,4 @@ apps/*/dist/
 .cursor/
 .agents
 docs/do/
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -4,6 +4,7 @@ repos:
    hooks:
      - id: check-yaml
        exclude: config/envoy.template*
        args: [--allow-multiple-documents]
      - id: end-of-file-fixer
      - id: trailing-whitespace
  - repo: local
--- a/apps/www/src/components/Hero.tsx
+++ b/apps/www/src/components/Hero.tsx
@ -24,7 +24,7 @@ export function Hero() {
            >
              <div className="inline-flex flex-wrap items-center gap-1.5 sm:gap-2 px-3 sm:px-4 py-1 rounded-full bg-[rgba(185,191,255,0.4)] border border-[var(--secondary)] shadow backdrop-blur hover:bg-[rgba(185,191,255,0.6)] transition-colors cursor-pointer">
                <span className="text-xs sm:text-sm font-medium text-black/65">
-                  v0.4.11
+                  v0.4.12
                </span>
                <span className="text-xs sm:text-sm font-medium text-black ">
                  —
--- a/build_filter_image.sh
+++ b/build_filter_image.sh
@ -1 +1 @@
-docker build  -f Dockerfile . -t katanemo/plano -t katanemo/plano:0.4.11
+docker build  -f Dockerfile . -t katanemo/plano -t katanemo/plano:0.4.12
--- a/cli/planoai/init.py
+++ b/cli/planoai/init.py
@ -1,3 +1,3 @@
 """Plano CLI - Intelligent Prompt Gateway."""
-__version__ = "0.4.11"
+__version__ = "0.4.12"
--- a/cli/planoai/config_generator.py
+++ b/cli/planoai/config_generator.py
@ -3,18 +3,17 @@ import os
 from planoai.utils import convert_legacy_listeners
 from jinja2 import Environment, FileSystemLoader
 import yaml
-from jsonschema import validate
+from jsonschema import validate, ValidationError
 from urllib.parse import urlparse
 from copy import deepcopy
 from planoai.consts import DEFAULT_OTEL_TRACING_GRPC_ENDPOINT
 SUPPORTED_PROVIDERS_WITH_BASE_URL = [
    "azure_openai",
    "ollama",
    "qwen",
    "amazon_bedrock",
-    "arch",
+    "plano",
 ]
 SUPPORTED_PROVIDERS_WITHOUT_BASE_URL = [
@ -368,47 +367,52 @@ def validate_and_render_schema():
                    llms_with_endpoint.append(model_provider)
                    llms_with_endpoint_cluster_names.add(cluster_name)
-    if len(model_usage_name_keys) > 0:
+    overrides_config = config_yaml.get("overrides", {})
-        routing_model_provider = config_yaml.get("routing", {}).get(
+    # Build lookup of model names (already prefix-stripped by config processing)
-            "model_provider", None
+    model_name_set = {mp.get("model") for mp in updated_model_providers}
    # Auto-add arch-router provider if routing preferences exist and no provider matches the router model
    router_model = overrides_config.get("llm_routing_model", "Arch-Router")
    # Strip provider prefix for comparison since config processing strips prefixes from model names
    router_model_id = (
        router_model.split("/", 1)[1] if "/" in router_model else router_model
    )
    if len(model_usage_name_keys) > 0 and router_model_id not in model_name_set:
        updated_model_providers.append(
            {
                "name": "arch-router",
                "provider_interface": "plano",
                "model": router_model_id,
                "internal": True,
            }
        )
        if (
            routing_model_provider
            and routing_model_provider not in model_provider_name_set
        ):
            raise Exception(
                f"Routing model_provider {routing_model_provider} is not defined in model_providers"
            )
        if (
            routing_model_provider is None
            and "arch-router" not in model_provider_name_set
        ):
            updated_model_providers.append(
                {
                    "name": "arch-router",
                    "provider_interface": "arch",
                    "model": config_yaml.get("routing", {}).get("model", "Arch-Router"),
                    "internal": True,
                }
            )
    # Always add arch-function model provider if not already defined
    if "arch-function" not in model_provider_name_set:
        updated_model_providers.append(
            {
                "name": "arch-function",
-                "provider_interface": "arch",
+                "provider_interface": "plano",
                "model": "Arch-Function",
                "internal": True,
            }
        )
-    if "plano-orchestrator" not in model_provider_name_set:
+    # Auto-add plano-orchestrator provider if no provider matches the orchestrator model
    orchestrator_model = overrides_config.get(
        "agent_orchestration_model", "Plano-Orchestrator"
    )
    orchestrator_model_id = (
        orchestrator_model.split("/", 1)[1]
        if "/" in orchestrator_model
        else orchestrator_model
    )
    if orchestrator_model_id not in model_name_set:
        updated_model_providers.append(
            {
-                "name": "plano-orchestrator",
+                "name": "plano/orchestrator",
-                "provider_interface": "arch",
+                "provider_interface": "plano",
-                "model": "Plano-Orchestrator",
+                "model": orchestrator_model_id,
                "internal": True,
            }
        )
@ -513,11 +517,15 @@ def validate_prompt_config(plano_config_file, plano_config_schema_file):
    try:
        validate(config_yaml, config_schema_yaml)
-    except Exception as e:
+    except ValidationError as e:
-        print(
+        path = (
-            f"Error validating plano_config file: {plano_config_file}, schema file: {plano_config_schema_file}, error: {e}"
+            " → ".join(str(p) for p in e.absolute_path) if e.absolute_path else "root"
        )
-        raise e
+        raise ValidationError(
            f"{e.message}\n  Location: {path}\n  Value: {e.instance}"
        ) from None
    except Exception as e:
        raise
 if __name__ == "__main__":
--- a/cli/planoai/consts.py
+++ b/cli/planoai/consts.py
@ -5,7 +5,7 @@ PLANO_COLOR = "#969FF4"
 SERVICE_NAME_ARCHGW = "plano"
 PLANO_DOCKER_NAME = "plano"
-PLANO_DOCKER_IMAGE = os.getenv("PLANO_DOCKER_IMAGE", "katanemo/plano:0.4.11")
+PLANO_DOCKER_IMAGE = os.getenv("PLANO_DOCKER_IMAGE", "katanemo/plano:0.4.12")
 DEFAULT_OTEL_TRACING_GRPC_ENDPOINT = "http://localhost:4317"
 # Native mode constants
--- a/cli/planoai/native_runner.py
+++ b/cli/planoai/native_runner.py
@ -420,9 +420,16 @@ def native_validate_config(plano_config_file):
    with _temporary_env(overrides):
        from planoai.config_generator import validate_and_render_schema
-        # Suppress verbose print output from config_generator
+        # Suppress verbose print output from config_generator but capture errors
-        with contextlib.redirect_stdout(io.StringIO()):
+        captured = io.StringIO()
-            validate_and_render_schema()
+        try:
            with contextlib.redirect_stdout(captured):
                validate_and_render_schema()
        except SystemExit:
            # validate_and_render_schema calls exit(1) on failure after
            # printing to stdout; re-raise so the caller gets a useful message.
            output = captured.getvalue().strip()
            raise Exception(output) if output else Exception("Config validation failed")
 def native_logs(debug=False, follow=False):
--- a/cli/pyproject.toml
+++ b/cli/pyproject.toml
@ -1,6 +1,6 @@
 [project]
 name = "planoai"
-version = "0.4.11"
+version = "0.4.12"
 description = "Python-based CLI tool to manage Plano."
 authors = [{name = "Katanemo Labs, Inc."}]
 readme = "README.md"
--- a/cli/uv.lock
+++ b/cli/uv.lock
@ -337,7 +337,7 @@ wheels = [
 [[package]]
 name = "planoai"
-version = "0.4.9"
+version = "0.4.12"
 source = { editable = "." }
 dependencies = [
    { name = "click" },
--- a/config/envoy.template.yaml
+++ b/config/envoy.template.yaml
@ -594,13 +594,13 @@ static_resources:
  clusters:
-    - name: arch
+    - name: plano
      connect_timeout: {{ upstream_connect_timeout | default('5s') }}
      type: LOGICAL_DNS
      dns_lookup_family: V4_ONLY
      lb_policy: ROUND_ROBIN
      load_assignment:
-        cluster_name: arch
+        cluster_name: plano
        endpoints:
          - lb_endpoints:
              - endpoint:
--- a/config/plano_config_schema.yaml
+++ b/config/plano_config_schema.yaml
@ -181,7 +181,7 @@ properties:
        provider_interface:
          type: string
          enum:
-            - arch
+            - plano
            - claude
            - deepseek
            - groq
@ -228,7 +228,7 @@ properties:
        provider_interface:
          type: string
          enum:
-            - arch
+            - plano
            - claude
            - deepseek
            - groq
@ -279,6 +279,12 @@ properties:
      upstream_tls_ca_path:
        type: string
        description: "Path to the trusted CA bundle for upstream TLS verification. Default is '/etc/ssl/certs/ca-certificates.crt'."
      llm_routing_model:
        type: string
        description: "Model name for the LLM router (e.g., 'Arch-Router'). Must match a model in model_providers."
      agent_orchestration_model:
        type: string
        description: "Model name for the agent orchestrator (e.g., 'Plano-Orchestrator'). Must match a model in model_providers."
  system_prompt:
    type: string
  prompt_targets:
@ -416,14 +422,6 @@ properties:
    enum:
      - llm
      - prompt
  routing:
    type: object
    properties:
      llm_provider:
        type: string
      model:
        type: string
    additionalProperties: false
  state_storage:
    type: object
    properties:
--- a/crates/brightstaff/src/handlers/agent_selector.rs
+++ b/crates/brightstaff/src/handlers/agent_selector.rs
@ -178,6 +178,7 @@ mod tests {
        Arc::new(OrchestratorService::new(
            "http://localhost:8080".to_string(),
            "test-model".to_string(),
            "plano-orchestrator".to_string(),
        ))
    }
--- a/crates/brightstaff/src/handlers/integration_tests.rs
+++ b/crates/brightstaff/src/handlers/integration_tests.rs
@ -23,6 +23,7 @@ mod tests {
        Arc::new(OrchestratorService::new(
            "http://localhost:8080".to_string(),
            "test-model".to_string(),
            "plano-orchestrator".to_string(),
        ))
    }
--- a/crates/brightstaff/src/main.rs
+++ b/crates/brightstaff/src/main.rs
@ -11,9 +11,7 @@ use brightstaff::state::StateStorage;
 use brightstaff::utils::tracing::init_tracer;
 use bytes::Bytes;
 use common::configuration::{Agent, Configuration, ListenerType};
-use common::consts::{
+use common::consts::{CHAT_COMPLETIONS_PATH, MESSAGES_PATH, OPENAI_RESPONSES_API_PATH};
    CHAT_COMPLETIONS_PATH, MESSAGES_PATH, OPENAI_RESPONSES_API_PATH, PLANO_ORCHESTRATOR_MODEL_NAME,
 };
 use common::llm_providers::LlmProviders;
 use http_body_util::{combinators::BoxBody, BodyExt, Empty};
 use hyper::body::Incoming;
@ -36,6 +34,8 @@ pub mod router;
 const BIND_ADDRESS: &str = "0.0.0.0:9091";
 const DEFAULT_ROUTING_LLM_PROVIDER: &str = "arch-router";
 const DEFAULT_ROUTING_MODEL_NAME: &str = "Arch-Router";
 const DEFAULT_ORCHESTRATOR_LLM_PROVIDER: &str = "plano-orchestrator";
 const DEFAULT_ORCHESTRATOR_MODEL_NAME: &str = "Plano-Orchestrator";
 // Utility function to extract the context from the incoming request headers
 fn extract_context_from_request(req: &Request<Incoming>) -> Context {
@ -139,16 +139,21 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
        env::var("LLM_PROVIDER_ENDPOINT").unwrap_or_else(|_| "http://localhost:12001".to_string());
    let listener = TcpListener::bind(bind_address).await?;
-    let routing_model_name: String = plano_config
+    let overrides = plano_config.overrides.clone().unwrap_or_default();
-        .routing
+
-        .as_ref()
+    // Strip provider prefix (e.g. "arch/") to get the model ID used in upstream requests
-        .and_then(|r| r.model.clone())
+    let routing_model_name: String = overrides
-        .unwrap_or_else(|| DEFAULT_ROUTING_MODEL_NAME.to_string());
+        .llm_routing_model
        .as_deref()
        .map(|m| m.split_once('/').map(|(_, id)| id).unwrap_or(m))
        .unwrap_or(DEFAULT_ROUTING_MODEL_NAME)
        .to_string();
    let routing_llm_provider = plano_config
-        .routing
+        .model_providers
-        .as_ref()
+        .iter()
-        .and_then(|r| r.model_provider.clone())
+        .find(|p| p.model.as_deref() == Some(routing_model_name.as_str()))
        .map(|p| p.name.clone())
        .unwrap_or_else(|| DEFAULT_ROUTING_LLM_PROVIDER.to_string());
    let router_service: Arc<RouterService> = Arc::new(RouterService::new(
@ -158,9 +163,25 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
        routing_llm_provider,
    ));
    // Strip provider prefix (e.g. "arch/") to get the model ID used in upstream requests
    let orchestrator_model_name: String = overrides
        .agent_orchestration_model
        .as_deref()
        .map(|m| m.split_once('/').map(|(_, id)| id).unwrap_or(m))
        .unwrap_or(DEFAULT_ORCHESTRATOR_MODEL_NAME)
        .to_string();
    let orchestrator_llm_provider: String = plano_config
        .model_providers
        .iter()
        .find(|p| p.model.as_deref() == Some(orchestrator_model_name.as_str()))
        .map(|p| p.name.clone())
        .unwrap_or_else(|| DEFAULT_ORCHESTRATOR_LLM_PROVIDER.to_string());
    let orchestrator_service: Arc<OrchestratorService> = Arc::new(OrchestratorService::new(
        format!("{llm_provider_url}{CHAT_COMPLETIONS_PATH}"),
-        PLANO_ORCHESTRATOR_MODEL_NAME.to_string(),
+        orchestrator_model_name,
        orchestrator_llm_provider,
    ));
    let model_aliases = Arc::new(plano_config.model_aliases.clone());
--- a/crates/brightstaff/src/router/plano_orchestrator.rs
+++ b/crates/brightstaff/src/router/plano_orchestrator.rs
@ -2,7 +2,7 @@ use std::{collections::HashMap, sync::Arc};
 use common::{
    configuration::{AgentUsagePreference, OrchestrationPreference},
-    consts::{ARCH_PROVIDER_HINT_HEADER, PLANO_ORCHESTRATOR_MODEL_NAME, REQUEST_ID_HEADER},
+    consts::{ARCH_PROVIDER_HINT_HEADER, REQUEST_ID_HEADER},
 };
 use hermesllm::apis::openai::{ChatCompletionsResponse, Message};
 use hyper::header;
@ -19,6 +19,7 @@ pub struct OrchestratorService {
    orchestrator_url: String,
    client: reqwest::Client,
    orchestrator_model: Arc<dyn OrchestratorModel>,
    orchestrator_provider_name: String,
 }
 #[derive(Debug, Error)]
@ -36,7 +37,11 @@ pub enum OrchestrationError {
 pub type Result<T> = std::result::Result<T, OrchestrationError>;
 impl OrchestratorService {
-    pub fn new(orchestrator_url: String, orchestration_model_name: String) -> Self {
+    pub fn new(
        orchestrator_url: String,
        orchestration_model_name: String,
        orchestrator_provider_name: String,
    ) -> Self {
        // Empty agent orchestrations - will be provided via usage_preferences in requests
        let agent_orchestrations: HashMap<String, Vec<OrchestrationPreference>> = HashMap::new();
@ -50,6 +55,7 @@ impl OrchestratorService {
            orchestrator_url,
            client: reqwest::Client::new(),
            orchestrator_model,
            orchestrator_provider_name,
        }
    }
@ -75,12 +81,12 @@ impl OrchestratorService {
        debug!(
            model = %self.orchestrator_model.get_model_name(),
            endpoint = %self.orchestrator_url,
-            "sending request to arch-orchestrator"
+            "sending request to plano-orchestrator"
        );
        debug!(
            body = %serde_json::to_string(&orchestrator_request).unwrap(),
-            "arch orchestrator request"
+            "plano orchestrator request"
        );
        let mut orchestration_request_headers = header::HeaderMap::new();
@ -91,7 +97,7 @@ impl OrchestratorService {
        orchestration_request_headers.insert(
            header::HeaderName::from_static(ARCH_PROVIDER_HINT_HEADER),
-            header::HeaderValue::from_str(PLANO_ORCHESTRATOR_MODEL_NAME).unwrap(),
+            header::HeaderValue::from_str(&self.orchestrator_provider_name).unwrap(),
        );
        // Inject OpenTelemetry trace context from current span
@ -110,7 +116,7 @@ impl OrchestratorService {
        orchestration_request_headers.insert(
            header::HeaderName::from_static("model"),
-            header::HeaderValue::from_static(PLANO_ORCHESTRATOR_MODEL_NAME),
+            header::HeaderValue::from_str(&self.orchestrator_provider_name).unwrap(),
        );
        let start_time = std::time::Instant::now();
--- a/crates/common/src/configuration.rs
+++ b/crates/common/src/configuration.rs
@ -7,12 +7,6 @@ use crate::api::open_ai::{
    ChatCompletionTool, FunctionDefinition, FunctionParameter, FunctionParameters, ParameterType,
 };
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct Routing {
    pub model_provider: Option<String>,
    pub model: Option<String>,
 }
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct ModelAlias {
    pub target: String,
@ -84,7 +78,6 @@ pub struct Configuration {
    pub ratelimits: Option<Vec<Ratelimit>>,
    pub tracing: Option<Tracing>,
    pub mode: Option<GatewayMode>,
    pub routing: Option<Routing>,
    pub agents: Option<Vec<Agent>>,
    pub filters: Option<Vec<Agent>>,
    pub listeners: Vec<Listener>,
@ -96,6 +89,8 @@ pub struct Overrides {
    pub prompt_target_intent_matching_threshold: Option<f64>,
    pub optimize_context_window: Option<bool>,
    pub use_agent_orchestrator: Option<bool>,
    pub llm_routing_model: Option<String>,
    pub agent_orchestration_model: Option<String>,
 }
 #[derive(Debug, Clone, Serialize, Deserialize, Default)]
@ -219,8 +214,6 @@ pub struct EmbeddingProviver {
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
 pub enum LlmProviderType {
    #[serde(rename = "arch")]
    Arch,
    #[serde(rename = "anthropic")]
    Anthropic,
    #[serde(rename = "deepseek")]
@ -249,12 +242,13 @@ pub enum LlmProviderType {
    Qwen,
    #[serde(rename = "amazon_bedrock")]
    AmazonBedrock,
    #[serde(rename = "plano")]
    Plano,
 }
 impl Display for LlmProviderType {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            LlmProviderType::Arch => write!(f, "arch"),
            LlmProviderType::Anthropic => write!(f, "anthropic"),
            LlmProviderType::Deepseek => write!(f, "deepseek"),
            LlmProviderType::Groq => write!(f, "groq"),
@ -269,6 +263,7 @@ impl Display for LlmProviderType {
            LlmProviderType::Zhipu => write!(f, "zhipu"),
            LlmProviderType::Qwen => write!(f, "qwen"),
            LlmProviderType::AmazonBedrock => write!(f, "amazon_bedrock"),
            LlmProviderType::Plano => write!(f, "plano"),
        }
    }
 }
@ -603,14 +598,14 @@ mod test {
            },
            LlmProvider {
                name: "arch-router".to_string(),
-                provider_interface: LlmProviderType::Arch,
+                provider_interface: LlmProviderType::Plano,
                model: Some("Arch-Router".to_string()),
                internal: Some(true),
                ..Default::default()
            },
            LlmProvider {
                name: "plano-orchestrator".to_string(),
-                provider_interface: LlmProviderType::Arch,
+                provider_interface: LlmProviderType::Plano,
                model: Some("Plano-Orchestrator".to_string()),
                internal: Some(true),
                ..Default::default()
--- a/crates/common/src/consts.rs
+++ b/crates/common/src/consts.rs
@ -33,5 +33,4 @@ pub const OTEL_COLLECTOR_HTTP: &str = "opentelemetry_collector_http";
 pub const LLM_ROUTE_HEADER: &str = "x-arch-llm-route";
 pub const ENVOY_RETRY_HEADER: &str = "x-envoy-max-retries";
 pub const BRIGHT_STAFF_SERVICE_NAME: &str = "brightstaff";
-pub const PLANO_ORCHESTRATOR_MODEL_NAME: &str = "Plano-Orchestrator";
+pub const PLANO_FC_CLUSTER: &str = "plano";
 pub const ARCH_FC_CLUSTER: &str = "arch";
--- a/crates/hermesllm/src/bin/provider_models.yaml
+++ b/crates/hermesllm/src/bin/provider_models.yaml
@ -1,183 +1,16 @@
 version: '1.0'
 source: canonical-apis
 providers:
  mistralai:
  - mistralai/mistral-medium-2505
  - mistralai/mistral-medium-2508
  - mistralai/mistral-medium-latest
  - mistralai/mistral-medium
  - mistralai/mistral-vibe-cli-with-tools
  - mistralai/open-mistral-nemo
  - mistralai/open-mistral-nemo-2407
  - mistralai/mistral-tiny-2407
  - mistralai/mistral-tiny-latest
  - mistralai/mistral-large-2411
  - mistralai/pixtral-large-2411
  - mistralai/pixtral-large-latest
  - mistralai/mistral-large-pixtral-2411
  - mistralai/codestral-2508
  - mistralai/codestral-latest
  - mistralai/devstral-small-2507
  - mistralai/devstral-medium-2507
  - mistralai/devstral-2512
  - mistralai/mistral-vibe-cli-latest
  - mistralai/devstral-medium-latest
  - mistralai/devstral-latest
  - mistralai/labs-devstral-small-2512
  - mistralai/devstral-small-latest
  - mistralai/mistral-small-2506
  - mistralai/mistral-small-latest
  - mistralai/labs-mistral-small-creative
  - mistralai/magistral-medium-2509
  - mistralai/magistral-medium-latest
  - mistralai/magistral-small-2509
  - mistralai/magistral-small-latest
  - mistralai/mistral-large-2512
  - mistralai/mistral-large-latest
  - mistralai/ministral-3b-2512
  - mistralai/ministral-3b-latest
  - mistralai/ministral-8b-2512
  - mistralai/ministral-8b-latest
  - mistralai/ministral-14b-2512
  - mistralai/ministral-14b-latest
  - mistralai/mistral-small-2501
  - mistralai/mistral-embed-2312
  - mistralai/mistral-embed
  - mistralai/codestral-embed
  - mistralai/codestral-embed-2505
  openai:
  - openai/gpt-4-0613
  - openai/gpt-4
  - openai/gpt-3.5-turbo
  - openai/gpt-5.2-codex
  - openai/gpt-3.5-turbo-instruct
  - openai/gpt-3.5-turbo-instruct-0914
  - openai/gpt-4-1106-preview
  - openai/gpt-3.5-turbo-1106
  - openai/gpt-4-0125-preview
  - openai/gpt-4-turbo-preview
  - openai/gpt-3.5-turbo-0125
  - openai/gpt-4-turbo
  - openai/gpt-4-turbo-2024-04-09
  - openai/gpt-4o
  - openai/gpt-4o-2024-05-13
  - openai/gpt-4o-mini-2024-07-18
  - openai/gpt-4o-mini
  - openai/gpt-4o-2024-08-06
  - openai/chatgpt-4o-latest
  - openai/o1-2024-12-17
  - openai/o1
  - openai/computer-use-preview
  - openai/o3-mini
  - openai/o3-mini-2025-01-31
  - openai/gpt-4o-2024-11-20
  - openai/computer-use-preview-2025-03-11
  - openai/gpt-4o-search-preview-2025-03-11
  - openai/gpt-4o-search-preview
  - openai/gpt-4o-mini-search-preview-2025-03-11
  - openai/gpt-4o-mini-search-preview
  - openai/o1-pro-2025-03-19
  - openai/o1-pro
  - openai/o3-2025-04-16
  - openai/o4-mini-2025-04-16
  - openai/o3
  - openai/o4-mini
  - openai/gpt-4.1-2025-04-14
  - openai/gpt-4.1
  - openai/gpt-4.1-mini-2025-04-14
  - openai/gpt-4.1-mini
  - openai/gpt-4.1-nano-2025-04-14
  - openai/gpt-4.1-nano
  - openai/o3-pro
  - openai/o3-pro-2025-06-10
  - openai/o4-mini-deep-research
  - openai/o3-deep-research
  - openai/o3-deep-research-2025-06-26
  - openai/o4-mini-deep-research-2025-06-26
  - openai/gpt-5-chat-latest
  - openai/gpt-5-2025-08-07
  - openai/gpt-5
  - openai/gpt-5-mini-2025-08-07
  - openai/gpt-5-mini
  - openai/gpt-5-nano-2025-08-07
  - openai/gpt-5-nano
  - openai/gpt-5-codex
  - openai/gpt-5-pro-2025-10-06
  - openai/gpt-5-pro
  - openai/gpt-5-search-api
  - openai/gpt-5-search-api-2025-10-14
  - openai/gpt-5.1-chat-latest
  - openai/gpt-5.1-2025-11-13
  - openai/gpt-5.1
  - openai/gpt-5.1-codex
  - openai/gpt-5.1-codex-mini
  - openai/gpt-5.1-codex-max
  - openai/gpt-5.2-2025-12-11
  - openai/gpt-5.2
  - openai/gpt-5.2-pro-2025-12-11
  - openai/gpt-5.2-pro
  - openai/gpt-5.2-chat-latest
  - openai/gpt-3.5-turbo-16k
  - openai/ft:gpt-3.5-turbo-0613:katanemo::8CMZbm0P
  deepseek:
  - deepseek/deepseek-chat
  - deepseek/deepseek-reasoner
  x-ai:
  - x-ai/grok-2-vision-1212
  - x-ai/grok-3
  - x-ai/grok-3-mini
  - x-ai/grok-4-0709
  - x-ai/grok-4-1-fast-non-reasoning
  - x-ai/grok-4-1-fast-reasoning
  - x-ai/grok-4-fast-non-reasoning
  - x-ai/grok-4-fast-reasoning
  - x-ai/grok-code-fast-1
  - x-ai/grok-imagine-image
  - x-ai/grok-imagine-video
  moonshotai:
  - moonshotai/kimi-k2-thinking
  - moonshotai/kimi-k2.5
  - moonshotai/moonshot-v1-128k-vision-preview
  - moonshotai/moonshot-v1-8k
  - moonshotai/kimi-k2-turbo-preview
  - moonshotai/moonshot-v1-128k
  - moonshotai/moonshot-v1-32k-vision-preview
  - moonshotai/kimi-k2-thinking-turbo
  - moonshotai/kimi-latest
  - moonshotai/moonshot-v1-32k
  - moonshotai/moonshot-v1-auto
  - moonshotai/kimi-k2-0711-preview
  - moonshotai/kimi-k2-0905-preview
  - moonshotai/moonshot-v1-8k-vision-preview
  anthropic:
  - anthropic/claude-opus-4-6
  - anthropic/claude-opus-4-5-20251101
  - anthropic/claude-opus-4-5
  - anthropic/claude-haiku-4-5-20251001
  - anthropic/claude-haiku-4-5
  - anthropic/claude-sonnet-4-5-20250929
  - anthropic/claude-sonnet-4-5
  - anthropic/claude-opus-4-1-20250805
  - anthropic/claude-opus-4-1
  - anthropic/claude-opus-4-20250514
  - anthropic/claude-opus-4
  - anthropic/claude-sonnet-4-20250514
  - anthropic/claude-sonnet-4
  - anthropic/claude-3-7-sonnet-20250219
  - anthropic/claude-3-7-sonnet
  - anthropic/claude-3-5-haiku-20241022
  - anthropic/claude-3-5-haiku
  - anthropic/claude-3-haiku-20240307
  - anthropic/claude-3-haiku
  google:
  - google/gemini-2.5-flash
  - google/gemini-2.5-pro
  - google/gemini-2.0-flash
  - google/gemini-2.0-flash-001
  - google/gemini-2.0-flash-exp-image-generation
  - google/gemini-2.0-flash-lite-001
  - google/gemini-2.0-flash-lite
  - google/gemini-exp-1206
  - google/gemini-2.5-flash-preview-tts
  - google/gemini-2.5-pro-preview-tts
  - google/gemma-3-1b-it
@ -191,12 +24,15 @@ providers:
  - google/gemini-pro-latest
  - google/gemini-2.5-flash-lite
  - google/gemini-2.5-flash-image
  - google/gemini-2.5-flash-preview-09-2025
  - google/gemini-2.5-flash-lite-preview-09-2025
  - google/gemini-3-pro-preview
  - google/gemini-3-flash-preview
  - google/gemini-3.1-pro-preview
  - google/gemini-3.1-pro-preview-customtools
  - google/gemini-3.1-flash-lite-preview
  - google/gemini-3-pro-image-preview
  - google/nano-banana-pro-preview
  - google/gemini-3.1-flash-image-preview
  - google/gemini-robotics-er-1.5-preview
  - google/gemini-2.5-computer-use-preview-10-2025
  - google/deep-research-pro-preview-12-2025
@ -212,7 +48,37 @@ providers:
  - amazon/amazon.nova-premier-v1:0
  - amazon/amazon.nova-lite-v1:0
  - amazon/amazon.nova-micro-v1:0
  x-ai:
  - x-ai/grok-3
  - x-ai/grok-3-mini
  - x-ai/grok-4-0709
  - x-ai/grok-4-1-fast-non-reasoning
  - x-ai/grok-4-1-fast-reasoning
  - x-ai/grok-4-fast-non-reasoning
  - x-ai/grok-4-fast-reasoning
  - x-ai/grok-4.20-beta-0309-non-reasoning
  - x-ai/grok-4.20-beta-0309-reasoning
  - x-ai/grok-4.20-multi-agent-beta-0309
  - x-ai/grok-code-fast-1
  - x-ai/grok-imagine-image
  - x-ai/grok-imagine-video
  z-ai:
  - z-ai/glm-4.5
  - z-ai/glm-4.5-air
  - z-ai/glm-4.6
  - z-ai/glm-4.7
  - z-ai/glm-5
  qwen:
  - qwen/qwen3-asr-flash-2026-02-10
  - qwen/qwen3.5-flash-2026-02-23
  - qwen/qwen3.5-flash
  - qwen/qwen3.5-122b-a10b
  - qwen/qwen3.5-35b-a3b
  - qwen/qwen3.5-27b
  - qwen/qwen3-coder-next
  - qwen/qwen3.5-397b-a17b
  - qwen/qwen3.5-plus-2026-02-15
  - qwen/qwen3.5-plus
  - qwen/qwen3-vl-flash-2026-01-22
  - qwen/qwen3-max-2026-01-23
  - qwen/qwen-plus-character
@ -294,13 +160,161 @@ providers:
  - qwen/qwen-max
  - qwen/qwen-plus
  - qwen/qwen-turbo
-  z-ai:
+  mistralai:
-  - z-ai/glm-4.5
+  - mistralai/mistral-medium-2505
-  - z-ai/glm-4.5-air
+  - mistralai/mistral-medium-2508
-  - z-ai/glm-4.6
+  - mistralai/mistral-medium-latest
-  - z-ai/glm-4.7
+  - mistralai/mistral-medium
-  - z-ai/glm-5
+  - mistralai/mistral-vibe-cli-with-tools
  - mistralai/open-mistral-nemo
  - mistralai/open-mistral-nemo-2407
  - mistralai/mistral-tiny-2407
  - mistralai/mistral-tiny-latest
  - mistralai/codestral-2508
  - mistralai/codestral-latest
  - mistralai/devstral-2512
  - mistralai/mistral-vibe-cli-latest
  - mistralai/devstral-medium-latest
  - mistralai/devstral-latest
  - mistralai/mistral-small-2506
  - mistralai/mistral-small-latest
  - mistralai/labs-mistral-small-creative
  - mistralai/magistral-medium-2509
  - mistralai/magistral-medium-latest
  - mistralai/magistral-small-2509
  - mistralai/magistral-small-latest
  - mistralai/mistral-large-2512
  - mistralai/mistral-large-latest
  - mistralai/ministral-3b-2512
  - mistralai/ministral-3b-latest
  - mistralai/ministral-8b-2512
  - mistralai/ministral-8b-latest
  - mistralai/ministral-14b-2512
  - mistralai/ministral-14b-latest
  - mistralai/mistral-large-2411
  - mistralai/pixtral-large-2411
  - mistralai/pixtral-large-latest
  - mistralai/mistral-large-pixtral-2411
  - mistralai/devstral-small-2507
  - mistralai/devstral-medium-2507
  - mistralai/labs-devstral-small-2512
  - mistralai/devstral-small-latest
  - mistralai/mistral-squarepoint-2602
  - mistralai/mistral-embed-2312
  - mistralai/mistral-embed
  - mistralai/codestral-embed
  - mistralai/codestral-embed-2505
  moonshotai:
  - moonshotai/kimi-k2.5
  - moonshotai/kimi-k2-0905-preview
  - moonshotai/moonshot-v1-32k
  - moonshotai/moonshot-v1-128k
  - moonshotai/kimi-k2-thinking-turbo
  - moonshotai/moonshot-v1-8k-vision-preview
  - moonshotai/kimi-k2-0711-preview
  - moonshotai/moonshot-v1-auto
  - moonshotai/kimi-k2-thinking
  - moonshotai/moonshot-v1-128k-vision-preview
  - moonshotai/kimi-k2-turbo-preview
  - moonshotai/moonshot-v1-32k-vision-preview
  - moonshotai/moonshot-v1-8k
  anthropic:
  - anthropic/claude-sonnet-4-6
  - anthropic/claude-opus-4-6
  - anthropic/claude-opus-4-5-20251101
  - anthropic/claude-opus-4-5
  - anthropic/claude-haiku-4-5-20251001
  - anthropic/claude-haiku-4-5
  - anthropic/claude-sonnet-4-5-20250929
  - anthropic/claude-sonnet-4-5
  - anthropic/claude-opus-4-1-20250805
  - anthropic/claude-opus-4-1
  - anthropic/claude-opus-4-20250514
  - anthropic/claude-opus-4
  - anthropic/claude-sonnet-4-20250514
  - anthropic/claude-sonnet-4
  - anthropic/claude-3-haiku-20240307
  - anthropic/claude-3-haiku
  openai:
  - openai/gpt-4-0613
  - openai/gpt-4
  - openai/gpt-3.5-turbo
  - openai/gpt-5.4
  - openai/gpt-5.3-chat-latest
  - openai/gpt-5.4-2026-03-05
  - openai/gpt-5.4-pro
  - openai/gpt-5.4-pro-2026-03-05
  - openai/gpt-3.5-turbo-instruct
  - openai/gpt-3.5-turbo-instruct-0914
  - openai/gpt-4-1106-preview
  - openai/gpt-3.5-turbo-1106
  - openai/gpt-4-0125-preview
  - openai/gpt-4-turbo-preview
  - openai/gpt-3.5-turbo-0125
  - openai/gpt-4-turbo
  - openai/gpt-4-turbo-2024-04-09
  - openai/gpt-4o
  - openai/gpt-4o-2024-05-13
  - openai/gpt-4o-mini-2024-07-18
  - openai/gpt-4o-mini
  - openai/gpt-4o-2024-08-06
  - openai/o1-2024-12-17
  - openai/o1
  - openai/computer-use-preview
  - openai/o3-mini
  - openai/o3-mini-2025-01-31
  - openai/gpt-4o-2024-11-20
  - openai/computer-use-preview-2025-03-11
  - openai/gpt-4o-mini-search-preview-2025-03-11
  - openai/gpt-4o-mini-search-preview
  - openai/o1-pro-2025-03-19
  - openai/o1-pro
  - openai/o3-2025-04-16
  - openai/o4-mini-2025-04-16
  - openai/o3
  - openai/o4-mini
  - openai/gpt-4.1-2025-04-14
  - openai/gpt-4.1
  - openai/gpt-4.1-mini-2025-04-14
  - openai/gpt-4.1-mini
  - openai/gpt-4.1-nano-2025-04-14
  - openai/gpt-4.1-nano
  - openai/o3-pro
  - openai/o3-pro-2025-06-10
  - openai/o4-mini-deep-research
  - openai/o3-deep-research
  - openai/o3-deep-research-2025-06-26
  - openai/o4-mini-deep-research-2025-06-26
  - openai/gpt-5-chat-latest
  - openai/gpt-5-2025-08-07
  - openai/gpt-5
  - openai/gpt-5-mini-2025-08-07
  - openai/gpt-5-mini
  - openai/gpt-5-nano-2025-08-07
  - openai/gpt-5-nano
  - openai/gpt-5-codex
  - openai/gpt-5-pro-2025-10-06
  - openai/gpt-5-pro
  - openai/gpt-5-search-api
  - openai/gpt-5-search-api-2025-10-14
  - openai/gpt-5.1-chat-latest
  - openai/gpt-5.1-2025-11-13
  - openai/gpt-5.1
  - openai/gpt-5.1-codex
  - openai/gpt-5.1-codex-mini
  - openai/gpt-5.1-codex-max
  - openai/gpt-5.2-2025-12-11
  - openai/gpt-5.2
  - openai/gpt-5.2-pro-2025-12-11
  - openai/gpt-5.2-pro
  - openai/gpt-5.2-chat-latest
  - openai/gpt-5.2-codex
  - openai/gpt-5.3-codex
  - openai/gpt-4o-search-preview
  - openai/gpt-4o-search-preview-2025-03-11
  - openai/gpt-3.5-turbo-16k
  - openai/ft:gpt-3.5-turbo-0613:katanemo::8CMZbm0P
 metadata:
  total_providers: 10
-  total_models: 289
+  total_models: 303
-  last_updated: 2026-02-13T22:44:30.413065+00:00
+  last_updated: 2026-03-15T16:47:22.207197+00:00
--- a/crates/hermesllm/src/lib.rs
+++ b/crates/hermesllm/src/lib.rs
@ -35,7 +35,7 @@ mod tests {
            ProviderId::Mistral
        );
        assert_eq!(ProviderId::try_from("groq").unwrap(), ProviderId::Groq);
-        assert_eq!(ProviderId::try_from("arch").unwrap(), ProviderId::Arch);
+        assert_eq!(ProviderId::try_from("plano").unwrap(), ProviderId::Plano);
        // Test aliases
        assert_eq!(ProviderId::try_from("google").unwrap(), ProviderId::Gemini);
--- a/crates/hermesllm/src/providers/id.rs
+++ b/crates/hermesllm/src/providers/id.rs
@ -34,7 +34,7 @@ pub enum ProviderId {
    Gemini,
    Anthropic,
    GitHub,
-    Arch,
+    Plano,
    AzureOpenAI,
    XAI,
    TogetherAI,
@ -58,7 +58,7 @@ impl TryFrom<&str> for ProviderId {
            "google" => Ok(ProviderId::Gemini), // alias
            "anthropic" => Ok(ProviderId::Anthropic),
            "github" => Ok(ProviderId::GitHub),
-            "arch" => Ok(ProviderId::Arch),
+            "plano" => Ok(ProviderId::Plano),
            "azure_openai" => Ok(ProviderId::AzureOpenAI),
            "xai" => Ok(ProviderId::XAI),
            "together_ai" => Ok(ProviderId::TogetherAI),
@ -135,7 +135,7 @@ impl ProviderId {
                | ProviderId::Groq
                | ProviderId::Mistral
                | ProviderId::Deepseek
-                | ProviderId::Arch
+                | ProviderId::Plano
                | ProviderId::Gemini
                | ProviderId::GitHub
                | ProviderId::AzureOpenAI
@ -153,7 +153,7 @@ impl ProviderId {
                | ProviderId::Groq
                | ProviderId::Mistral
                | ProviderId::Deepseek
-                | ProviderId::Arch
+                | ProviderId::Plano
                | ProviderId::Gemini
                | ProviderId::GitHub
                | ProviderId::AzureOpenAI
@ -219,7 +219,7 @@ impl Display for ProviderId {
            ProviderId::Gemini => write!(f, "Gemini"),
            ProviderId::Anthropic => write!(f, "Anthropic"),
            ProviderId::GitHub => write!(f, "GitHub"),
-            ProviderId::Arch => write!(f, "Arch"),
+            ProviderId::Plano => write!(f, "Plano"),
            ProviderId::AzureOpenAI => write!(f, "azure_openai"),
            ProviderId::XAI => write!(f, "xai"),
            ProviderId::TogetherAI => write!(f, "together_ai"),
--- a/crates/llm_gateway/src/stream_context.rs
+++ b/crates/llm_gateway/src/stream_context.rs
@ -873,7 +873,7 @@ impl HttpContext for StreamContext {
                // ensure that the provider has an endpoint if the access key is missing else return a bad request
                if self.llm_provider.as_ref().unwrap().endpoint.is_none()
                    && self.llm_provider.as_ref().unwrap().provider_interface
-                        != LlmProviderType::Arch
+                        != LlmProviderType::Plano
                {
                    self.send_server_error(error, Some(StatusCode::BAD_REQUEST));
                }
--- a/demos/agent_orchestration/travel_agents/README.md
+++ b/demos/agent_orchestration/travel_agents/README.md
@ -123,6 +123,42 @@ Each agent:
 Both agents run as native local processes and communicate with Plano running natively on the host.
 ## Running with local Plano-Orchestrator (via vLLM)
 By default, Plano uses a hosted Plano-Orchestrator endpoint. To self-host the orchestrator model locally using vLLM on a server with an NVIDIA GPU:
 1. Install vLLM (the model is pulled from HuggingFace when the server starts in the next step):
 ```bash
 pip install vllm
 ```
 2. Start the vLLM server with the 4B model:
 ```bash
 vllm serve katanemo/Plano-Orchestrator-4B \
    --host 0.0.0.0 \
    --port 8000 \
    --tensor-parallel-size 1 \
    --gpu-memory-utilization 0.3 \
    --tokenizer katanemo/Plano-Orchestrator-4B \
    --chat-template chat_template.jinja \
    --served-model-name katanemo/Plano-Orchestrator-4B \
    --enable-prefix-caching
 ```
 3. Start the demo with the local orchestrator config:
 ```bash
 ./run_demo.sh --local-orchestrator
 ```
 4. Test with curl:
 ```bash
 curl -X POST http://localhost:8001/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "gpt-5.2", "messages": [{"role": "user", "content": "What is the weather in Istanbul?"}]}'
 ```
 You should see Plano use your local orchestrator to route the request to the weather agent.
 ## Observability
 This demo includes full OpenTelemetry (OTel) compatible distributed tracing to monitor and debug agent interactions:
--- a/demos/agent_orchestration/travel_agents/config_local_orchestrator.yaml
+++ b/demos/agent_orchestration/travel_agents/config_local_orchestrator.yaml
@ -0,0 +1,66 @@
 version: v0.3.0
 overrides:
  agent_orchestration_model: plano/katanemo/Plano-Orchestrator-4B
 agents:
  - id: weather_agent
    url: http://localhost:10510
  - id: flight_agent
    url: http://localhost:10520
 model_providers:
  - model: plano/katanemo/Plano-Orchestrator-4B
    base_url: http://localhost:8000
  - model: openai/gpt-5.2
    access_key: $OPENAI_API_KEY
    default: true
  - model: openai/gpt-4o-mini
    access_key: $OPENAI_API_KEY # smaller, faster, cheaper model for extracting entities like location
 listeners:
  - type: agent
    name: travel_booking_service
    port: 8001
    router: plano_orchestrator_v1
    agents:
      - id: weather_agent
        description: |
          WeatherAgent is a specialized AI assistant for real-time weather information and forecasts. It provides accurate weather data for any city worldwide using the Open-Meteo API, helping travelers plan their trips with up-to-date weather conditions.
          Capabilities:
            * Get real-time weather conditions and multi-day forecasts for any city worldwide using Open-Meteo API (free, no API key needed)
            * Provides current temperature
            * Provides multi-day forecasts
            * Provides weather conditions
            * Provides sunrise/sunset times
            * Provides detailed weather information
            * Understands conversation context to resolve location references from previous messages
            * Handles weather-related questions including "What's the weather in [city]?", "What's the forecast for [city]?", "How's the weather in [city]?"
            * When queries include both weather and other travel questions (e.g., flights, currency), this agent answers ONLY the weather part
      - id: flight_agent
        description: |
          FlightAgent is an AI-powered tool specialized in providing live flight information between airports. It leverages the FlightAware AeroAPI to deliver real-time flight status, gate information, and delay updates.
          Capabilities:
            * Get live flight information between airports using FlightAware AeroAPI
            * Shows real-time flight status
            * Shows scheduled/estimated/actual departure and arrival times
            * Shows gate and terminal information
            * Shows delays
            * Shows aircraft type
            * Shows flight status
            * Automatically resolves city names to airport codes (IATA/ICAO)
            * Understands conversation context to infer origin/destination from follow-up questions
            * Handles flight-related questions including "What flights go from [city] to [city]?", "Do flights go to [city]?", "Are there direct flights from [city]?"
            * When queries include both flight and other travel questions (e.g., weather, currency), this agent answers ONLY the flight part
 tracing:
  random_sampling: 100
  span_attributes:
    header_prefixes:
      - x-acme-
--- a/demos/agent_orchestration/travel_agents/run_demo.sh
+++ b/demos/agent_orchestration/travel_agents/run_demo.sh
@ -31,8 +31,13 @@ start_demo() {
  fi
  # Step 4: Start Plano
-  echo "Starting Plano with config.yaml..."
+  PLANO_CONFIG="config.yaml"
-  planoai up config.yaml
+  if [ "$1" == "--local-orchestrator" ]; then
    PLANO_CONFIG="config_local_orchestrator.yaml"
    echo "Using local orchestrator config..."
  fi
  echo "Starting Plano with $PLANO_CONFIG..."
  planoai up "$PLANO_CONFIG"
  # Step 5: Start agents natively
  echo "Starting agents..."
--- a/demos/llm_routing/model_routing_service/README.md
+++ b/demos/llm_routing/model_routing_service/README.md
@ -1,6 +1,54 @@
 # Model Routing Service Demo
-This demo shows how to use the `/routing/v1/*` endpoints to get routing decisions without proxying requests to an LLM. The endpoint accepts standard LLM request formats and returns which model Plano's router would select.
+Plano is an AI-native proxy and data plane for agentic apps — with built-in orchestration, safety, observability, and intelligent LLM routing.
 ```
 ┌───────────┐      ┌─────────────────────────────────┐      ┌──────────────┐
 │  Client   │ ───► │  Plano                          │ ───► │  OpenAI      │
 │  (any     │      │                                 │      │  Anthropic   │
 │  language)│      │  Arch-Router (1.5B model)       │      │  Any Provider│
 └───────────┘      │  analyzes intent → picks model  │      └──────────────┘
                   └─────────────────────────────────┘
 ```
 - **One endpoint, many models** — apps call Plano using standard OpenAI/Anthropic APIs; Plano handles provider selection, keys, and failover
 - **Intelligent routing** — a lightweight 1.5B router model classifies user intent and picks the best model per request
 - **Platform governance** — centralize API keys, rate limits, guardrails, and observability without touching app code
 - **Runs anywhere** — single binary; self-host the router for full data privacy
 ## How Routing Works
 The entire routing configuration is plain YAML — no code:
 ```yaml
 model_providers:
  - model: openai/gpt-4o-mini
    default: true                    # fallback for unmatched requests
  - model: openai/gpt-4o
    routing_preferences:
      - name: complex_reasoning
        description: complex reasoning tasks, multi-step analysis
  - model: anthropic/claude-sonnet-4-20250514
    routing_preferences:
      - name: code_generation
        description: generating new code, writing functions
 ```
 When a request arrives, Plano sends the conversation and routing preferences to Arch-Router, which classifies the intent and returns the matching route:
 ```
 1. Request arrives          → "Write binary search in Python"
 2. Preferences serialized   → [{"name":"code_generation", ...}, {"name":"complex_reasoning", ...}]
 3. Arch-Router classifies   → {"route": "code_generation"}
 4. Route → Model lookup     → code_generation → anthropic/claude-sonnet-4-20250514
 5. Request forwarded        → Claude generates the response
 ```
 No match? Arch-Router returns `other` → Plano falls back to the default model.
 The `/routing/v1/*` endpoints return the routing decision **without** forwarding to the LLM — useful for testing and validating routing behavior before going to production.
 ## Setup
@ -55,6 +103,69 @@ Response:
 The response tells you which model would handle this request and which route was matched, without actually making the LLM call.
 ## Kubernetes Deployment (Self-hosted Arch-Router on GPU)
 To run Arch-Router in-cluster using vLLM instead of the default hosted endpoint:
 **0. Check your GPU node labels and taints**
 ```bash
 kubectl get nodes --show-labels | grep -i gpu
 kubectl get node <gpu-node-name> -o jsonpath='{.spec.taints}'
 ```
 GPU nodes commonly have a `nvidia.com/gpu:NoSchedule` taint — `vllm-deployment.yaml` includes a matching toleration. If you have multiple GPU node pools and need to pin to a specific one, uncomment and set the `nodeSelector` in `vllm-deployment.yaml` using the label for your cloud provider.
 **1. Deploy Arch-Router and Plano:**
 ```bash
 # arch-router deployment
 kubectl apply -f vllm-deployment.yaml
 # plano deployment
 kubectl create secret generic plano-secrets \
  --from-literal=OPENAI_API_KEY=$OPENAI_API_KEY \
  --from-literal=ANTHROPIC_API_KEY=$ANTHROPIC_API_KEY
 kubectl create configmap plano-config \
  --from-file=plano_config.yaml=config_k8s.yaml \
  --dry-run=client -o yaml | kubectl apply -f -
 kubectl apply -f plano-deployment.yaml
 ```
 **2. Wait for both pods to be ready:**
 ```bash
 # Arch-Router downloads the model (~1 min) then vLLM loads it (~2 min)
 kubectl get pods -l app=arch-router -w
 kubectl rollout status deployment/plano
 ```
 **3. Test:**
 ```bash
 kubectl port-forward svc/plano 12000:12000
 ./demo.sh
 ```
 To confirm requests are hitting your in-cluster Arch-Router (not just health checks):
 ```bash
 kubectl logs -l app=arch-router -f --tail=0
 # Look for POST /v1/chat/completions entries
 ```
 **Updating the config:**
 ```bash
 kubectl create configmap plano-config \
  --from-file=plano_config.yaml=config_k8s.yaml \
  --dry-run=client -o yaml | kubectl apply -f -
 kubectl rollout restart deployment/plano
 ```
 ## Demo Output
 ```
--- a/demos/llm_routing/model_routing_service/config_k8s.yaml
+++ b/demos/llm_routing/model_routing_service/config_k8s.yaml
@ -0,0 +1,33 @@
 version: v0.3.0
 overrides:
  llm_routing_model: plano/Arch-Router
 listeners:
  - type: model
    name: model_listener
    port: 12000
 model_providers:
  - model: plano/Arch-Router
    base_url: http://arch-router:10000
  - model: openai/gpt-4o-mini
    access_key: $OPENAI_API_KEY
    default: true
  - model: openai/gpt-4o
    access_key: $OPENAI_API_KEY
    routing_preferences:
      - name: complex_reasoning
        description: complex reasoning tasks, multi-step analysis, or detailed explanations
  - model: anthropic/claude-sonnet-4-20250514
    access_key: $ANTHROPIC_API_KEY
    routing_preferences:
      - name: code_generation
        description: generating new code, writing functions, or creating boilerplate
 tracing:
  random_sampling: 100
--- a/demos/llm_routing/model_routing_service/plano-deployment.yaml
+++ b/demos/llm_routing/model_routing_service/plano-deployment.yaml
@ -0,0 +1,68 @@
 apiVersion: apps/v1
 kind: Deployment
 metadata:
  name: plano
  labels:
    app: plano
 spec:
  replicas: 1
  selector:
    matchLabels:
      app: plano
  template:
    metadata:
      labels:
        app: plano
    spec:
      containers:
        - name: plano
          image: katanemo/plano:0.4.12
          ports:
            - containerPort: 12000  # LLM gateway (chat completions, model routing)
              name: llm-gateway
          envFrom:
            - secretRef:
                name: plano-secrets
          env:
            - name: LOG_LEVEL
              value: "info"
          volumeMounts:
            - name: plano-config
              mountPath: /app/plano_config.yaml
              subPath: plano_config.yaml
              readOnly: true
          readinessProbe:
            httpGet:
              path: /healthz
              port: 12000
            initialDelaySeconds: 5
            periodSeconds: 10
          livenessProbe:
            httpGet:
              path: /healthz
              port: 12000
            initialDelaySeconds: 10
            periodSeconds: 30
          resources:
            requests:
              memory: "256Mi"
              cpu: "250m"
            limits:
              memory: "512Mi"
              cpu: "1000m"
      volumes:
        - name: plano-config
          configMap:
            name: plano-config
 ---
 apiVersion: v1
 kind: Service
 metadata:
  name: plano
 spec:
  selector:
    app: plano
  ports:
    - name: llm-gateway
      port: 12000
      targetPort: 12000
--- a/demos/llm_routing/model_routing_service/test.rest
+++ b/demos/llm_routing/model_routing_service/test.rest
@ -0,0 +1,36 @@
 ### Code generation query (OpenAI format) — expects anthropic/claude-sonnet
 POST http://localhost:12000/routing/v1/chat/completions
 Content-Type: application/json
 {
  "model": "gpt-4o-mini",
  "messages": [{"role": "user", "content": "Write a Python function for binary search"}]
 }
 ### Complex reasoning query (OpenAI format) — expects openai/gpt-4o
 POST http://localhost:12000/routing/v1/chat/completions
 Content-Type: application/json
 {
  "model": "gpt-4o-mini",
  "messages": [{"role": "user", "content": "Analyze the trade-offs between microservices and monolithic architecture"}]
 }
 ### Simple query — no routing match, expects default model
 POST http://localhost:12000/routing/v1/chat/completions
 Content-Type: application/json
 {
  "model": "gpt-4o-mini",
  "messages": [{"role": "user", "content": "Hello"}]
 }
 ### Code generation query (Anthropic format)
 POST http://localhost:12000/routing/v1/messages
 Content-Type: application/json
 {
  "model": "claude-sonnet-4-20250514",
  "max_tokens": 1024,
  "messages": [{"role": "user", "content": "Write a REST API in Go using Gin"}]
 }
--- a/demos/llm_routing/model_routing_service/vllm-deployment.yaml
+++ b/demos/llm_routing/model_routing_service/vllm-deployment.yaml
@ -0,0 +1,104 @@
 apiVersion: apps/v1
 kind: Deployment
 metadata:
  name: arch-router
  labels:
    app: arch-router
 spec:
  replicas: 1
  selector:
    matchLabels:
      app: arch-router
  template:
    metadata:
      labels:
        app: arch-router
    spec:
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      # Optional: add a nodeSelector to pin to a specific GPU node pool.
      # The nvidia.com/gpu resource request below is sufficient for most clusters.
      # nodeSelector:
      #   DigitalOcean: doks.digitalocean.com/gpu-model: l40s
      #   GKE:          cloud.google.com/gke-accelerator: nvidia-l4
      #   EKS:          eks.amazonaws.com/nodegroup: gpu-nodes
      #   AKS:          kubernetes.azure.com/agentpool: gpupool
      initContainers:
        - name: download-model
          image: python:3.11-slim
          command:
            - sh
            - -c
            - |
              pip install huggingface_hub[cli] && \
              python -c "from huggingface_hub import snapshot_download; snapshot_download('katanemo/Arch-Router-1.5B.gguf', local_dir='/models/Arch-Router-1.5B.gguf')"
          volumeMounts:
            - name: model-cache
              mountPath: /models
      containers:
        - name: vllm
          image: vllm/vllm-openai:latest
          command:
            - vllm
            - serve
            - /models/Arch-Router-1.5B.gguf/Arch-Router-1.5B-Q4_K_M.gguf
            - "--host"
            - "0.0.0.0"
            - "--port"
            - "10000"
            - "--load-format"
            - "gguf"
            - "--tokenizer"
            - "katanemo/Arch-Router-1.5B"
            - "--served-model-name"
            - "Arch-Router"
            - "--gpu-memory-utilization"
            - "0.3"
            - "--tensor-parallel-size"
            - "1"
            - "--enable-prefix-caching"
          ports:
            - name: http
              containerPort: 10000
              protocol: TCP
          resources:
            requests:
              cpu: "1"
              memory: "4Gi"
              nvidia.com/gpu: "1"
            limits:
              cpu: "4"
              memory: "8Gi"
              nvidia.com/gpu: "1"
          volumeMounts:
            - name: model-cache
              mountPath: /models
          readinessProbe:
            httpGet:
              path: /health
              port: 10000
            initialDelaySeconds: 60
            periodSeconds: 10
          livenessProbe:
            httpGet:
              path: /health
              port: 10000
            initialDelaySeconds: 180
            periodSeconds: 30
      volumes:
        - name: model-cache
          emptyDir: {}
 ---
 apiVersion: v1
 kind: Service
 metadata:
  name: arch-router
 spec:
  selector:
    app: arch-router
  ports:
    - name: http
      port: 10000
      targetPort: 10000
--- a/demos/llm_routing/openclaw_routing/config.yaml
+++ b/demos/llm_routing/openclaw_routing/config.yaml
@ -1,8 +1,7 @@
 version: v0.1.0
-routing:
+overrides:
-  model: Arch-Router
+  llm_routing_model: Arch-Router
  llm_provider: arch-router
 listeners:
  egress_traffic:
--- a/demos/llm_routing/preference_based_routing/plano_config_local.yaml
+++ b/demos/llm_routing/preference_based_routing/plano_config_local.yaml
@ -1,8 +1,7 @@
 version: v0.3.0
-routing:
+overrides:
-  model: Arch-Router
+  llm_routing_model: plano/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M
  llm_provider: arch-router
 listeners:
  - type: model
@ -11,8 +10,7 @@ listeners:
 model_providers:
-  - name: arch-router
+  - model: plano/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M
    model: arch/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M
    base_url: http://localhost:11434
  - model: openai/gpt-4o-mini
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@ -17,7 +17,7 @@ from sphinxawesome_theme.postprocess import Icons
 project = "Plano Docs"
 copyright = "2025, Katanemo Labs, Inc"
 author = "Katanemo Labs, Inc"
-release = " v0.4.11"
+release = " v0.4.12"
 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
--- a/docs/source/get_started/quickstart.rst
+++ b/docs/source/get_started/quickstart.rst
@ -43,7 +43,7 @@ Plano's CLI allows you to manage and interact with the Plano efficiently. To ins
 .. code-block:: console
-   $ uv tool install planoai==0.4.11
+   $ uv tool install planoai==0.4.12
 **Option 2: Install with pip (Traditional)**
@ -51,7 +51,7 @@ Plano's CLI allows you to manage and interact with the Plano efficiently. To ins
   $ python -m venv venv
   $ source venv/bin/activate   # On Windows, use: venv\Scripts\activate
-   $ pip install planoai==0.4.11
+   $ pip install planoai==0.4.12
 .. _llm_routing_quickstart:
--- a/docs/source/guides/llm_router.rst
+++ b/docs/source/guides/llm_router.rst
@ -253,13 +253,11 @@ Using Ollama (recommended for local development)
   .. code-block:: yaml
-       routing:
+       overrides:
-         model: Arch-Router
+         llm_routing_model: plano/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M
         llm_provider: arch-router
       model_providers:
-         - name: arch-router
+         - model: plano/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M
           model: arch/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M
           base_url: http://localhost:11434
         - model: openai/gpt-5.2
@ -324,13 +322,11 @@ vLLM provides higher throughput and GPU optimizations suitable for production de
   .. code-block:: yaml
-       routing:
+       overrides:
-         model: Arch-Router
+         llm_routing_model: plano/Arch-Router
         llm_provider: arch-router
       model_providers:
-         - name: arch-router
+         - model: plano/Arch-Router
           model: Arch-Router
           base_url: http://<your-server-ip>:10000
         - model: openai/gpt-5.2
@ -351,6 +347,35 @@ vLLM provides higher throughput and GPU optimizations suitable for production de
       curl http://localhost:10000/v1/models
 Using vLLM on Kubernetes (GPU nodes)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 For teams running Kubernetes, Arch-Router and Plano can be deployed as in-cluster services.
 The ``demos/llm_routing/model_routing_service/`` directory includes ready-to-use manifests:
 - ``vllm-deployment.yaml`` — Arch-Router served by vLLM, with an init container to download
  the model from HuggingFace
 - ``plano-deployment.yaml`` — Plano proxy configured to use the in-cluster Arch-Router
 - ``config_k8s.yaml`` — Plano config with ``llm_routing_model`` set to ``plano/Arch-Router``,
  a model provider whose ``base_url`` is ``http://arch-router:10000`` instead of the default hosted endpoint
 Key things to know before deploying:
 - GPU nodes commonly have a ``nvidia.com/gpu:NoSchedule`` taint — the ``vllm-deployment.yaml``
  includes a matching toleration. The ``nvidia.com/gpu: "1"`` resource request is sufficient
  for scheduling in most clusters; a ``nodeSelector`` is optional and commented out in the
  manifest for cases where you need to pin to a specific GPU node pool.
 - Model download takes ~1 minute; vLLM loads the model in ~1-2 minutes after that. The
  ``livenessProbe`` has a 180-second ``initialDelaySeconds`` to avoid premature restarts.
 - The Plano config ConfigMap must use ``--from-file=plano_config.yaml=config_k8s.yaml`` with
  ``subPath`` in the Deployment — omitting ``subPath`` causes Kubernetes to mount a directory
  instead of a file.
 For the canonical Plano Kubernetes deployment (ConfigMap, Secrets, Deployment YAML), see
 :ref:`deployment`. For full step-by-step commands specific to this demo, see the
 `demo README <https://github.com/katanemo/plano/tree/main/demos/llm_routing/model_routing_service/README.md>`_.
 Combining Routing Methods
 -------------------------
--- a/docs/source/guides/orchestration.rst
+++ b/docs/source/guides/orchestration.rst
@ -335,6 +335,90 @@ Combine RAG agents for documentation lookup with specialized troubleshooting age
      - id: troubleshoot_agent
        description: Diagnoses and resolves technical issues step by step
 Self-hosting Plano-Orchestrator
 -------------------------------
 By default, Plano uses a hosted Plano-Orchestrator endpoint. To self-host the orchestrator model, you can serve it using **vLLM** on a server with an NVIDIA GPU.
 .. note::
   vLLM requires a Linux server with an NVIDIA GPU (CUDA). For local development on macOS, a GGUF version for Ollama is coming soon.
 The following model variants are available on HuggingFace:
 * `Plano-Orchestrator-4B <https://huggingface.co/katanemo/Plano-Orchestrator-4B>`_ — lighter model, suitable for development and testing
 * `Plano-Orchestrator-4B-FP8 <https://huggingface.co/katanemo/Plano-Orchestrator-4B-FP8>`_ — FP8 quantized 4B model, lower memory usage
 * `Plano-Orchestrator-30B-A3B <https://huggingface.co/katanemo/Plano-Orchestrator-30B-A3B>`_ — full-size model for production
 * `Plano-Orchestrator-30B-A3B-FP8 <https://huggingface.co/katanemo/Plano-Orchestrator-30B-A3B-FP8>`_ — FP8 quantized 30B model, recommended for production deployments
 Using vLLM
 ~~~~~~~~~~
 1. **Install vLLM**
   .. code-block:: bash
       pip install vllm
 2. **Download the model and chat template**
   .. code-block:: bash
       pip install huggingface_hub
       huggingface-cli download katanemo/Plano-Orchestrator-4B
 3. **Start the vLLM server**
   For the 4B model (development):
   .. code-block:: bash
       vllm serve katanemo/Plano-Orchestrator-4B \
           --host 0.0.0.0 \
           --port 8000 \
           --tensor-parallel-size 1 \
           --gpu-memory-utilization 0.3 \
           --tokenizer katanemo/Plano-Orchestrator-4B \
           --chat-template chat_template.jinja \
           --served-model-name katanemo/Plano-Orchestrator-4B \
           --enable-prefix-caching
   For the 30B-A3B-FP8 model (production):
   .. code-block:: bash
       vllm serve katanemo/Plano-Orchestrator-30B-A3B-FP8 \
           --host 0.0.0.0 \
           --port 8000 \
           --tensor-parallel-size 1 \
           --gpu-memory-utilization 0.9 \
           --tokenizer katanemo/Plano-Orchestrator-30B-A3B-FP8 \
           --chat-template chat_template.jinja \
           --max-model-len 32768 \
           --served-model-name katanemo/Plano-Orchestrator-30B-A3B-FP8 \
           --enable-prefix-caching
 4. **Configure Plano to use the local orchestrator**
   Use the model name matching your ``--served-model-name``:
   .. code-block:: yaml
       overrides:
         agent_orchestration_model: plano/katanemo/Plano-Orchestrator-4B
       model_providers:
         - model: katanemo/Plano-Orchestrator-4B
           provider_interface: plano
           base_url: http://<your-server-ip>:8000
 5. **Verify the server is running**
   .. code-block:: bash
       curl http://localhost:8000/health
       curl http://localhost:8000/v1/models
 Next Steps
 ----------
--- a/docs/source/resources/deployment.rst
+++ b/docs/source/resources/deployment.rst
@ -65,7 +65,7 @@ Create a ``docker-compose.yml`` file with the following configuration:
   # docker-compose.yml
   services:
     plano:
-       image: katanemo/plano:0.4.11
+       image: katanemo/plano:0.4.12
       container_name: plano
       ports:
         - "10000:10000" # ingress (client -> plano)
@ -153,7 +153,7 @@ Create a ``plano-deployment.yaml``:
       spec:
         containers:
           - name: plano
-             image: katanemo/plano:0.4.11
+             image: katanemo/plano:0.4.12
             ports:
               - containerPort: 12000  # LLM gateway (chat completions, model routing)
                 name: llm-gateway
--- a/docs/source/resources/includes/plano_config_full_reference_rendered.yaml
+++ b/docs/source/resources/includes/plano_config_full_reference_rendered.yaml
@ -107,11 +107,11 @@ model_providers:
 - internal: true
  model: Arch-Function
  name: arch-function
-  provider_interface: arch
+  provider_interface: plano
 - internal: true
  model: Plano-Orchestrator
-  name: plano-orchestrator
+  name: plano/orchestrator
-  provider_interface: arch
+  provider_interface: plano
 prompt_targets:
 - description: Get current weather at a location.
  endpoint:
`@ -1 +1 @@`
	`docker build -f Dockerfile . -t katanemo/plano -t katanemo/plano:0.4.11`	`docker build -f Dockerfile . -t katanemo/plano -t katanemo/plano:0.4.12`
`@ -1,3 +1,3 @@`
	`"""Plano CLI - Intelligent Prompt Gateway."""`	`"""Plano CLI - Intelligent Prompt Gateway."""`

	`__version__ = "0.4.11"`	`__version__ = "0.4.12"`