diff --git a/.claude/skills/release/SKILL.md b/.claude/skills/release/SKILL.md index 80510004..ba101bd3 100644 --- a/.claude/skills/release/SKILL.md +++ b/.claude/skills/release/SKILL.md @@ -25,4 +25,6 @@ Update the version string in ALL of these files: Do NOT change version strings in `*.lock` files or `Cargo.lock`. +After updating all version strings, run `cd cli && uv lock` to update the lock file with the new version. + After making changes, show a summary of all files modified and the old → new version. diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 25e6f99d..01d5c33f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -133,13 +133,13 @@ jobs: load: true tags: | ${{ env.PLANO_DOCKER_IMAGE }} - ${{ env.DOCKER_IMAGE }}:0.4.11 + ${{ env.DOCKER_IMAGE }}:0.4.12 ${{ env.DOCKER_IMAGE }}:latest cache-from: type=gha cache-to: type=gha,mode=max - name: Save image as artifact - run: docker save ${{ env.PLANO_DOCKER_IMAGE }} ${{ env.DOCKER_IMAGE }}:0.4.11 ${{ env.DOCKER_IMAGE }}:latest -o /tmp/plano-image.tar + run: docker save ${{ env.PLANO_DOCKER_IMAGE }} ${{ env.DOCKER_IMAGE }}:0.4.12 ${{ env.DOCKER_IMAGE }}:latest -o /tmp/plano-image.tar - name: Upload image artifact uses: actions/upload-artifact@v6 diff --git a/.gitignore b/.gitignore index af706ea4..391c17fa 100644 --- a/.gitignore +++ b/.gitignore @@ -152,3 +152,4 @@ apps/*/dist/ .cursor/ .agents +docs/do/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 84001c45..22a18416 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,6 +4,7 @@ repos: hooks: - id: check-yaml exclude: config/envoy.template* + args: [--allow-multiple-documents] - id: end-of-file-fixer - id: trailing-whitespace - repo: local diff --git a/apps/www/src/components/Hero.tsx b/apps/www/src/components/Hero.tsx index 7952c68f..fcfe5f01 100644 --- a/apps/www/src/components/Hero.tsx +++ b/apps/www/src/components/Hero.tsx @@ -24,7 +24,7 @@ export function Hero() { >
- v0.4.11 + v0.4.12 — diff --git a/build_filter_image.sh b/build_filter_image.sh index 8e041894..15d3d10e 100644 --- a/build_filter_image.sh +++ b/build_filter_image.sh @@ -1 +1 @@ -docker build -f Dockerfile . -t katanemo/plano -t katanemo/plano:0.4.11 +docker build -f Dockerfile . -t katanemo/plano -t katanemo/plano:0.4.12 diff --git a/cli/planoai/__init__.py b/cli/planoai/__init__.py index b94eadc2..e69352e8 100644 --- a/cli/planoai/__init__.py +++ b/cli/planoai/__init__.py @@ -1,3 +1,3 @@ """Plano CLI - Intelligent Prompt Gateway.""" -__version__ = "0.4.11" +__version__ = "0.4.12" diff --git a/cli/planoai/config_generator.py b/cli/planoai/config_generator.py index a4f9eb21..929b7657 100644 --- a/cli/planoai/config_generator.py +++ b/cli/planoai/config_generator.py @@ -3,18 +3,17 @@ import os from planoai.utils import convert_legacy_listeners from jinja2 import Environment, FileSystemLoader import yaml -from jsonschema import validate +from jsonschema import validate, ValidationError from urllib.parse import urlparse from copy import deepcopy from planoai.consts import DEFAULT_OTEL_TRACING_GRPC_ENDPOINT - SUPPORTED_PROVIDERS_WITH_BASE_URL = [ "azure_openai", "ollama", "qwen", "amazon_bedrock", - "arch", + "plano", ] SUPPORTED_PROVIDERS_WITHOUT_BASE_URL = [ @@ -368,47 +367,52 @@ def validate_and_render_schema(): llms_with_endpoint.append(model_provider) llms_with_endpoint_cluster_names.add(cluster_name) - if len(model_usage_name_keys) > 0: - routing_model_provider = config_yaml.get("routing", {}).get( - "model_provider", None + overrides_config = config_yaml.get("overrides", {}) + # Build lookup of model names (already prefix-stripped by config processing) + model_name_set = {mp.get("model") for mp in updated_model_providers} + + # Auto-add arch-router provider if routing preferences exist and no provider matches the router model + router_model = overrides_config.get("llm_routing_model", "Arch-Router") + # Strip provider prefix for comparison since config 
processing strips prefixes from model names + router_model_id = ( + router_model.split("/", 1)[1] if "/" in router_model else router_model + ) + if len(model_usage_name_keys) > 0 and router_model_id not in model_name_set: + updated_model_providers.append( + { + "name": "arch-router", + "provider_interface": "plano", + "model": router_model_id, + "internal": True, + } ) - if ( - routing_model_provider - and routing_model_provider not in model_provider_name_set - ): - raise Exception( - f"Routing model_provider {routing_model_provider} is not defined in model_providers" - ) - if ( - routing_model_provider is None - and "arch-router" not in model_provider_name_set - ): - updated_model_providers.append( - { - "name": "arch-router", - "provider_interface": "arch", - "model": config_yaml.get("routing", {}).get("model", "Arch-Router"), - "internal": True, - } - ) # Always add arch-function model provider if not already defined if "arch-function" not in model_provider_name_set: updated_model_providers.append( { "name": "arch-function", - "provider_interface": "arch", + "provider_interface": "plano", "model": "Arch-Function", "internal": True, } ) - if "plano-orchestrator" not in model_provider_name_set: + # Auto-add plano-orchestrator provider if no provider matches the orchestrator model + orchestrator_model = overrides_config.get( + "agent_orchestration_model", "Plano-Orchestrator" + ) + orchestrator_model_id = ( + orchestrator_model.split("/", 1)[1] + if "/" in orchestrator_model + else orchestrator_model + ) + if orchestrator_model_id not in model_name_set: updated_model_providers.append( { - "name": "plano-orchestrator", - "provider_interface": "arch", - "model": "Plano-Orchestrator", + "name": "plano/orchestrator", + "provider_interface": "plano", + "model": orchestrator_model_id, "internal": True, } ) @@ -513,11 +517,15 @@ def validate_prompt_config(plano_config_file, plano_config_schema_file): try: validate(config_yaml, config_schema_yaml) - except Exception as e: 
- print( - f"Error validating plano_config file: {plano_config_file}, schema file: {plano_config_schema_file}, error: {e}" + except ValidationError as e: + path = ( + " → ".join(str(p) for p in e.absolute_path) if e.absolute_path else "root" ) - raise e + raise ValidationError( + f"{e.message}\n Location: {path}\n Value: {e.instance}" + ) from None + except Exception as e: + raise if __name__ == "__main__": diff --git a/cli/planoai/consts.py b/cli/planoai/consts.py index 145fb640..9c330caa 100644 --- a/cli/planoai/consts.py +++ b/cli/planoai/consts.py @@ -5,7 +5,7 @@ PLANO_COLOR = "#969FF4" SERVICE_NAME_ARCHGW = "plano" PLANO_DOCKER_NAME = "plano" -PLANO_DOCKER_IMAGE = os.getenv("PLANO_DOCKER_IMAGE", "katanemo/plano:0.4.11") +PLANO_DOCKER_IMAGE = os.getenv("PLANO_DOCKER_IMAGE", "katanemo/plano:0.4.12") DEFAULT_OTEL_TRACING_GRPC_ENDPOINT = "http://localhost:4317" # Native mode constants diff --git a/cli/planoai/native_runner.py b/cli/planoai/native_runner.py index 0e39a1fd..ed44e8ad 100644 --- a/cli/planoai/native_runner.py +++ b/cli/planoai/native_runner.py @@ -420,9 +420,16 @@ def native_validate_config(plano_config_file): with _temporary_env(overrides): from planoai.config_generator import validate_and_render_schema - # Suppress verbose print output from config_generator - with contextlib.redirect_stdout(io.StringIO()): - validate_and_render_schema() + # Suppress verbose print output from config_generator but capture errors + captured = io.StringIO() + try: + with contextlib.redirect_stdout(captured): + validate_and_render_schema() + except SystemExit: + # validate_and_render_schema calls exit(1) on failure after + # printing to stdout; re-raise so the caller gets a useful message. 
+ output = captured.getvalue().strip() + raise Exception(output) if output else Exception("Config validation failed") def native_logs(debug=False, follow=False): diff --git a/cli/pyproject.toml b/cli/pyproject.toml index 3f9be272..25cc81a4 100644 --- a/cli/pyproject.toml +++ b/cli/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "planoai" -version = "0.4.11" +version = "0.4.12" description = "Python-based CLI tool to manage Plano." authors = [{name = "Katanemo Labs, Inc."}] readme = "README.md" diff --git a/cli/uv.lock b/cli/uv.lock index 9d85bf85..dfca2484 100644 --- a/cli/uv.lock +++ b/cli/uv.lock @@ -337,7 +337,7 @@ wheels = [ [[package]] name = "planoai" -version = "0.4.9" +version = "0.4.12" source = { editable = "." } dependencies = [ { name = "click" }, diff --git a/config/envoy.template.yaml b/config/envoy.template.yaml index a780c3f1..c2dd5ed0 100644 --- a/config/envoy.template.yaml +++ b/config/envoy.template.yaml @@ -594,13 +594,13 @@ static_resources: clusters: - - name: arch + - name: plano connect_timeout: {{ upstream_connect_timeout | default('5s') }} type: LOGICAL_DNS dns_lookup_family: V4_ONLY lb_policy: ROUND_ROBIN load_assignment: - cluster_name: arch + cluster_name: plano endpoints: - lb_endpoints: - endpoint: diff --git a/config/plano_config_schema.yaml b/config/plano_config_schema.yaml index b65fdb17..e0a6eef1 100644 --- a/config/plano_config_schema.yaml +++ b/config/plano_config_schema.yaml @@ -181,7 +181,7 @@ properties: provider_interface: type: string enum: - - arch + - plano - claude - deepseek - groq @@ -228,7 +228,7 @@ properties: provider_interface: type: string enum: - - arch + - plano - claude - deepseek - groq @@ -279,6 +279,12 @@ properties: upstream_tls_ca_path: type: string description: "Path to the trusted CA bundle for upstream TLS verification. Default is '/etc/ssl/certs/ca-certificates.crt'." + llm_routing_model: + type: string + description: "Model name for the LLM router (e.g., 'Arch-Router'). 
Must match a model in model_providers." + agent_orchestration_model: + type: string + description: "Model name for the agent orchestrator (e.g., 'Plano-Orchestrator'). Must match a model in model_providers." system_prompt: type: string prompt_targets: @@ -416,14 +422,6 @@ properties: enum: - llm - prompt - routing: - type: object - properties: - llm_provider: - type: string - model: - type: string - additionalProperties: false state_storage: type: object properties: diff --git a/crates/brightstaff/src/handlers/agent_selector.rs b/crates/brightstaff/src/handlers/agent_selector.rs index 33cf73ff..0c9b018e 100644 --- a/crates/brightstaff/src/handlers/agent_selector.rs +++ b/crates/brightstaff/src/handlers/agent_selector.rs @@ -178,6 +178,7 @@ mod tests { Arc::new(OrchestratorService::new( "http://localhost:8080".to_string(), "test-model".to_string(), + "plano-orchestrator".to_string(), )) } diff --git a/crates/brightstaff/src/handlers/integration_tests.rs b/crates/brightstaff/src/handlers/integration_tests.rs index 8013ed0a..c3153d3d 100644 --- a/crates/brightstaff/src/handlers/integration_tests.rs +++ b/crates/brightstaff/src/handlers/integration_tests.rs @@ -23,6 +23,7 @@ mod tests { Arc::new(OrchestratorService::new( "http://localhost:8080".to_string(), "test-model".to_string(), + "plano-orchestrator".to_string(), )) } diff --git a/crates/brightstaff/src/main.rs b/crates/brightstaff/src/main.rs index c8e34002..96a66c60 100644 --- a/crates/brightstaff/src/main.rs +++ b/crates/brightstaff/src/main.rs @@ -11,9 +11,7 @@ use brightstaff::state::StateStorage; use brightstaff::utils::tracing::init_tracer; use bytes::Bytes; use common::configuration::{Agent, Configuration, ListenerType}; -use common::consts::{ - CHAT_COMPLETIONS_PATH, MESSAGES_PATH, OPENAI_RESPONSES_API_PATH, PLANO_ORCHESTRATOR_MODEL_NAME, -}; +use common::consts::{CHAT_COMPLETIONS_PATH, MESSAGES_PATH, OPENAI_RESPONSES_API_PATH}; use common::llm_providers::LlmProviders; use 
http_body_util::{combinators::BoxBody, BodyExt, Empty}; use hyper::body::Incoming; @@ -36,6 +34,8 @@ pub mod router; const BIND_ADDRESS: &str = "0.0.0.0:9091"; const DEFAULT_ROUTING_LLM_PROVIDER: &str = "arch-router"; const DEFAULT_ROUTING_MODEL_NAME: &str = "Arch-Router"; +const DEFAULT_ORCHESTRATOR_LLM_PROVIDER: &str = "plano-orchestrator"; +const DEFAULT_ORCHESTRATOR_MODEL_NAME: &str = "Plano-Orchestrator"; // Utility function to extract the context from the incoming request headers fn extract_context_from_request(req: &Request) -> Context { @@ -139,16 +139,21 @@ async fn main() -> Result<(), Box> { env::var("LLM_PROVIDER_ENDPOINT").unwrap_or_else(|_| "http://localhost:12001".to_string()); let listener = TcpListener::bind(bind_address).await?; - let routing_model_name: String = plano_config - .routing - .as_ref() - .and_then(|r| r.model.clone()) - .unwrap_or_else(|| DEFAULT_ROUTING_MODEL_NAME.to_string()); + let overrides = plano_config.overrides.clone().unwrap_or_default(); + + // Strip provider prefix (e.g. "arch/") to get the model ID used in upstream requests + let routing_model_name: String = overrides + .llm_routing_model + .as_deref() + .map(|m| m.split_once('/').map(|(_, id)| id).unwrap_or(m)) + .unwrap_or(DEFAULT_ROUTING_MODEL_NAME) + .to_string(); let routing_llm_provider = plano_config - .routing - .as_ref() - .and_then(|r| r.model_provider.clone()) + .model_providers + .iter() + .find(|p| p.model.as_deref() == Some(routing_model_name.as_str())) + .map(|p| p.name.clone()) .unwrap_or_else(|| DEFAULT_ROUTING_LLM_PROVIDER.to_string()); let router_service: Arc = Arc::new(RouterService::new( @@ -158,9 +163,25 @@ async fn main() -> Result<(), Box> { routing_llm_provider, )); + // Strip provider prefix (e.g. 
"arch/") to get the model ID used in upstream requests + let orchestrator_model_name: String = overrides + .agent_orchestration_model + .as_deref() + .map(|m| m.split_once('/').map(|(_, id)| id).unwrap_or(m)) + .unwrap_or(DEFAULT_ORCHESTRATOR_MODEL_NAME) + .to_string(); + + let orchestrator_llm_provider: String = plano_config + .model_providers + .iter() + .find(|p| p.model.as_deref() == Some(orchestrator_model_name.as_str())) + .map(|p| p.name.clone()) + .unwrap_or_else(|| DEFAULT_ORCHESTRATOR_LLM_PROVIDER.to_string()); + let orchestrator_service: Arc = Arc::new(OrchestratorService::new( format!("{llm_provider_url}{CHAT_COMPLETIONS_PATH}"), - PLANO_ORCHESTRATOR_MODEL_NAME.to_string(), + orchestrator_model_name, + orchestrator_llm_provider, )); let model_aliases = Arc::new(plano_config.model_aliases.clone()); diff --git a/crates/brightstaff/src/router/plano_orchestrator.rs b/crates/brightstaff/src/router/plano_orchestrator.rs index cf2688b9..12140570 100644 --- a/crates/brightstaff/src/router/plano_orchestrator.rs +++ b/crates/brightstaff/src/router/plano_orchestrator.rs @@ -2,7 +2,7 @@ use std::{collections::HashMap, sync::Arc}; use common::{ configuration::{AgentUsagePreference, OrchestrationPreference}, - consts::{ARCH_PROVIDER_HINT_HEADER, PLANO_ORCHESTRATOR_MODEL_NAME, REQUEST_ID_HEADER}, + consts::{ARCH_PROVIDER_HINT_HEADER, REQUEST_ID_HEADER}, }; use hermesllm::apis::openai::{ChatCompletionsResponse, Message}; use hyper::header; @@ -19,6 +19,7 @@ pub struct OrchestratorService { orchestrator_url: String, client: reqwest::Client, orchestrator_model: Arc, + orchestrator_provider_name: String, } #[derive(Debug, Error)] @@ -36,7 +37,11 @@ pub enum OrchestrationError { pub type Result = std::result::Result; impl OrchestratorService { - pub fn new(orchestrator_url: String, orchestration_model_name: String) -> Self { + pub fn new( + orchestrator_url: String, + orchestration_model_name: String, + orchestrator_provider_name: String, + ) -> Self { // Empty agent 
orchestrations - will be provided via usage_preferences in requests let agent_orchestrations: HashMap> = HashMap::new(); @@ -50,6 +55,7 @@ impl OrchestratorService { orchestrator_url, client: reqwest::Client::new(), orchestrator_model, + orchestrator_provider_name, } } @@ -75,12 +81,12 @@ impl OrchestratorService { debug!( model = %self.orchestrator_model.get_model_name(), endpoint = %self.orchestrator_url, - "sending request to arch-orchestrator" + "sending request to plano-orchestrator" ); debug!( body = %serde_json::to_string(&orchestrator_request).unwrap(), - "arch orchestrator request" + "plano orchestrator request" ); let mut orchestration_request_headers = header::HeaderMap::new(); @@ -91,7 +97,7 @@ impl OrchestratorService { orchestration_request_headers.insert( header::HeaderName::from_static(ARCH_PROVIDER_HINT_HEADER), - header::HeaderValue::from_str(PLANO_ORCHESTRATOR_MODEL_NAME).unwrap(), + header::HeaderValue::from_str(&self.orchestrator_provider_name).unwrap(), ); // Inject OpenTelemetry trace context from current span @@ -110,7 +116,7 @@ impl OrchestratorService { orchestration_request_headers.insert( header::HeaderName::from_static("model"), - header::HeaderValue::from_static(PLANO_ORCHESTRATOR_MODEL_NAME), + header::HeaderValue::from_str(&self.orchestrator_provider_name).unwrap(), ); let start_time = std::time::Instant::now(); diff --git a/crates/common/src/configuration.rs b/crates/common/src/configuration.rs index 3050eac0..30187dd8 100644 --- a/crates/common/src/configuration.rs +++ b/crates/common/src/configuration.rs @@ -7,12 +7,6 @@ use crate::api::open_ai::{ ChatCompletionTool, FunctionDefinition, FunctionParameter, FunctionParameters, ParameterType, }; -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct Routing { - pub model_provider: Option, - pub model: Option, -} - #[derive(Debug, Clone, Serialize, Deserialize)] pub struct ModelAlias { pub target: String, @@ -84,7 +78,6 @@ pub struct Configuration { pub ratelimits: Option>, pub 
tracing: Option, pub mode: Option, - pub routing: Option, pub agents: Option>, pub filters: Option>, pub listeners: Vec, @@ -96,6 +89,8 @@ pub struct Overrides { pub prompt_target_intent_matching_threshold: Option, pub optimize_context_window: Option, pub use_agent_orchestrator: Option, + pub llm_routing_model: Option, + pub agent_orchestration_model: Option, } #[derive(Debug, Clone, Serialize, Deserialize, Default)] @@ -219,8 +214,6 @@ pub struct EmbeddingProviver { #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)] pub enum LlmProviderType { - #[serde(rename = "arch")] - Arch, #[serde(rename = "anthropic")] Anthropic, #[serde(rename = "deepseek")] @@ -249,12 +242,13 @@ pub enum LlmProviderType { Qwen, #[serde(rename = "amazon_bedrock")] AmazonBedrock, + #[serde(rename = "plano")] + Plano, } impl Display for LlmProviderType { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - LlmProviderType::Arch => write!(f, "arch"), LlmProviderType::Anthropic => write!(f, "anthropic"), LlmProviderType::Deepseek => write!(f, "deepseek"), LlmProviderType::Groq => write!(f, "groq"), @@ -269,6 +263,7 @@ impl Display for LlmProviderType { LlmProviderType::Zhipu => write!(f, "zhipu"), LlmProviderType::Qwen => write!(f, "qwen"), LlmProviderType::AmazonBedrock => write!(f, "amazon_bedrock"), + LlmProviderType::Plano => write!(f, "plano"), } } } @@ -603,14 +598,14 @@ mod test { }, LlmProvider { name: "arch-router".to_string(), - provider_interface: LlmProviderType::Arch, + provider_interface: LlmProviderType::Plano, model: Some("Arch-Router".to_string()), internal: Some(true), ..Default::default() }, LlmProvider { name: "plano-orchestrator".to_string(), - provider_interface: LlmProviderType::Arch, + provider_interface: LlmProviderType::Plano, model: Some("Plano-Orchestrator".to_string()), internal: Some(true), ..Default::default() diff --git a/crates/common/src/consts.rs b/crates/common/src/consts.rs index cafc8e80..dbd0bc41 100644 --- 
a/crates/common/src/consts.rs +++ b/crates/common/src/consts.rs @@ -33,5 +33,4 @@ pub const OTEL_COLLECTOR_HTTP: &str = "opentelemetry_collector_http"; pub const LLM_ROUTE_HEADER: &str = "x-arch-llm-route"; pub const ENVOY_RETRY_HEADER: &str = "x-envoy-max-retries"; pub const BRIGHT_STAFF_SERVICE_NAME: &str = "brightstaff"; -pub const PLANO_ORCHESTRATOR_MODEL_NAME: &str = "Plano-Orchestrator"; -pub const ARCH_FC_CLUSTER: &str = "arch"; +pub const PLANO_FC_CLUSTER: &str = "plano"; diff --git a/crates/hermesllm/src/bin/provider_models.yaml b/crates/hermesllm/src/bin/provider_models.yaml index a748e7a7..a1425d43 100644 --- a/crates/hermesllm/src/bin/provider_models.yaml +++ b/crates/hermesllm/src/bin/provider_models.yaml @@ -1,183 +1,16 @@ version: '1.0' source: canonical-apis providers: - mistralai: - - mistralai/mistral-medium-2505 - - mistralai/mistral-medium-2508 - - mistralai/mistral-medium-latest - - mistralai/mistral-medium - - mistralai/mistral-vibe-cli-with-tools - - mistralai/open-mistral-nemo - - mistralai/open-mistral-nemo-2407 - - mistralai/mistral-tiny-2407 - - mistralai/mistral-tiny-latest - - mistralai/mistral-large-2411 - - mistralai/pixtral-large-2411 - - mistralai/pixtral-large-latest - - mistralai/mistral-large-pixtral-2411 - - mistralai/codestral-2508 - - mistralai/codestral-latest - - mistralai/devstral-small-2507 - - mistralai/devstral-medium-2507 - - mistralai/devstral-2512 - - mistralai/mistral-vibe-cli-latest - - mistralai/devstral-medium-latest - - mistralai/devstral-latest - - mistralai/labs-devstral-small-2512 - - mistralai/devstral-small-latest - - mistralai/mistral-small-2506 - - mistralai/mistral-small-latest - - mistralai/labs-mistral-small-creative - - mistralai/magistral-medium-2509 - - mistralai/magistral-medium-latest - - mistralai/magistral-small-2509 - - mistralai/magistral-small-latest - - mistralai/mistral-large-2512 - - mistralai/mistral-large-latest - - mistralai/ministral-3b-2512 - - mistralai/ministral-3b-latest - - 
mistralai/ministral-8b-2512 - - mistralai/ministral-8b-latest - - mistralai/ministral-14b-2512 - - mistralai/ministral-14b-latest - - mistralai/mistral-small-2501 - - mistralai/mistral-embed-2312 - - mistralai/mistral-embed - - mistralai/codestral-embed - - mistralai/codestral-embed-2505 - openai: - - openai/gpt-4-0613 - - openai/gpt-4 - - openai/gpt-3.5-turbo - - openai/gpt-5.2-codex - - openai/gpt-3.5-turbo-instruct - - openai/gpt-3.5-turbo-instruct-0914 - - openai/gpt-4-1106-preview - - openai/gpt-3.5-turbo-1106 - - openai/gpt-4-0125-preview - - openai/gpt-4-turbo-preview - - openai/gpt-3.5-turbo-0125 - - openai/gpt-4-turbo - - openai/gpt-4-turbo-2024-04-09 - - openai/gpt-4o - - openai/gpt-4o-2024-05-13 - - openai/gpt-4o-mini-2024-07-18 - - openai/gpt-4o-mini - - openai/gpt-4o-2024-08-06 - - openai/chatgpt-4o-latest - - openai/o1-2024-12-17 - - openai/o1 - - openai/computer-use-preview - - openai/o3-mini - - openai/o3-mini-2025-01-31 - - openai/gpt-4o-2024-11-20 - - openai/computer-use-preview-2025-03-11 - - openai/gpt-4o-search-preview-2025-03-11 - - openai/gpt-4o-search-preview - - openai/gpt-4o-mini-search-preview-2025-03-11 - - openai/gpt-4o-mini-search-preview - - openai/o1-pro-2025-03-19 - - openai/o1-pro - - openai/o3-2025-04-16 - - openai/o4-mini-2025-04-16 - - openai/o3 - - openai/o4-mini - - openai/gpt-4.1-2025-04-14 - - openai/gpt-4.1 - - openai/gpt-4.1-mini-2025-04-14 - - openai/gpt-4.1-mini - - openai/gpt-4.1-nano-2025-04-14 - - openai/gpt-4.1-nano - - openai/o3-pro - - openai/o3-pro-2025-06-10 - - openai/o4-mini-deep-research - - openai/o3-deep-research - - openai/o3-deep-research-2025-06-26 - - openai/o4-mini-deep-research-2025-06-26 - - openai/gpt-5-chat-latest - - openai/gpt-5-2025-08-07 - - openai/gpt-5 - - openai/gpt-5-mini-2025-08-07 - - openai/gpt-5-mini - - openai/gpt-5-nano-2025-08-07 - - openai/gpt-5-nano - - openai/gpt-5-codex - - openai/gpt-5-pro-2025-10-06 - - openai/gpt-5-pro - - openai/gpt-5-search-api - - 
openai/gpt-5-search-api-2025-10-14 - - openai/gpt-5.1-chat-latest - - openai/gpt-5.1-2025-11-13 - - openai/gpt-5.1 - - openai/gpt-5.1-codex - - openai/gpt-5.1-codex-mini - - openai/gpt-5.1-codex-max - - openai/gpt-5.2-2025-12-11 - - openai/gpt-5.2 - - openai/gpt-5.2-pro-2025-12-11 - - openai/gpt-5.2-pro - - openai/gpt-5.2-chat-latest - - openai/gpt-3.5-turbo-16k - - openai/ft:gpt-3.5-turbo-0613:katanemo::8CMZbm0P deepseek: - deepseek/deepseek-chat - deepseek/deepseek-reasoner - x-ai: - - x-ai/grok-2-vision-1212 - - x-ai/grok-3 - - x-ai/grok-3-mini - - x-ai/grok-4-0709 - - x-ai/grok-4-1-fast-non-reasoning - - x-ai/grok-4-1-fast-reasoning - - x-ai/grok-4-fast-non-reasoning - - x-ai/grok-4-fast-reasoning - - x-ai/grok-code-fast-1 - - x-ai/grok-imagine-image - - x-ai/grok-imagine-video - moonshotai: - - moonshotai/kimi-k2-thinking - - moonshotai/kimi-k2.5 - - moonshotai/moonshot-v1-128k-vision-preview - - moonshotai/moonshot-v1-8k - - moonshotai/kimi-k2-turbo-preview - - moonshotai/moonshot-v1-128k - - moonshotai/moonshot-v1-32k-vision-preview - - moonshotai/kimi-k2-thinking-turbo - - moonshotai/kimi-latest - - moonshotai/moonshot-v1-32k - - moonshotai/moonshot-v1-auto - - moonshotai/kimi-k2-0711-preview - - moonshotai/kimi-k2-0905-preview - - moonshotai/moonshot-v1-8k-vision-preview - anthropic: - - anthropic/claude-opus-4-6 - - anthropic/claude-opus-4-5-20251101 - - anthropic/claude-opus-4-5 - - anthropic/claude-haiku-4-5-20251001 - - anthropic/claude-haiku-4-5 - - anthropic/claude-sonnet-4-5-20250929 - - anthropic/claude-sonnet-4-5 - - anthropic/claude-opus-4-1-20250805 - - anthropic/claude-opus-4-1 - - anthropic/claude-opus-4-20250514 - - anthropic/claude-opus-4 - - anthropic/claude-sonnet-4-20250514 - - anthropic/claude-sonnet-4 - - anthropic/claude-3-7-sonnet-20250219 - - anthropic/claude-3-7-sonnet - - anthropic/claude-3-5-haiku-20241022 - - anthropic/claude-3-5-haiku - - anthropic/claude-3-haiku-20240307 - - anthropic/claude-3-haiku google: - 
google/gemini-2.5-flash - google/gemini-2.5-pro - google/gemini-2.0-flash - google/gemini-2.0-flash-001 - - google/gemini-2.0-flash-exp-image-generation - google/gemini-2.0-flash-lite-001 - google/gemini-2.0-flash-lite - - google/gemini-exp-1206 - google/gemini-2.5-flash-preview-tts - google/gemini-2.5-pro-preview-tts - google/gemma-3-1b-it @@ -191,12 +24,15 @@ providers: - google/gemini-pro-latest - google/gemini-2.5-flash-lite - google/gemini-2.5-flash-image - - google/gemini-2.5-flash-preview-09-2025 - google/gemini-2.5-flash-lite-preview-09-2025 - google/gemini-3-pro-preview - google/gemini-3-flash-preview + - google/gemini-3.1-pro-preview + - google/gemini-3.1-pro-preview-customtools + - google/gemini-3.1-flash-lite-preview - google/gemini-3-pro-image-preview - google/nano-banana-pro-preview + - google/gemini-3.1-flash-image-preview - google/gemini-robotics-er-1.5-preview - google/gemini-2.5-computer-use-preview-10-2025 - google/deep-research-pro-preview-12-2025 @@ -212,7 +48,37 @@ providers: - amazon/amazon.nova-premier-v1:0 - amazon/amazon.nova-lite-v1:0 - amazon/amazon.nova-micro-v1:0 + x-ai: + - x-ai/grok-3 + - x-ai/grok-3-mini + - x-ai/grok-4-0709 + - x-ai/grok-4-1-fast-non-reasoning + - x-ai/grok-4-1-fast-reasoning + - x-ai/grok-4-fast-non-reasoning + - x-ai/grok-4-fast-reasoning + - x-ai/grok-4.20-beta-0309-non-reasoning + - x-ai/grok-4.20-beta-0309-reasoning + - x-ai/grok-4.20-multi-agent-beta-0309 + - x-ai/grok-code-fast-1 + - x-ai/grok-imagine-image + - x-ai/grok-imagine-video + z-ai: + - z-ai/glm-4.5 + - z-ai/glm-4.5-air + - z-ai/glm-4.6 + - z-ai/glm-4.7 + - z-ai/glm-5 qwen: + - qwen/qwen3-asr-flash-2026-02-10 + - qwen/qwen3.5-flash-2026-02-23 + - qwen/qwen3.5-flash + - qwen/qwen3.5-122b-a10b + - qwen/qwen3.5-35b-a3b + - qwen/qwen3.5-27b + - qwen/qwen3-coder-next + - qwen/qwen3.5-397b-a17b + - qwen/qwen3.5-plus-2026-02-15 + - qwen/qwen3.5-plus - qwen/qwen3-vl-flash-2026-01-22 - qwen/qwen3-max-2026-01-23 - qwen/qwen-plus-character @@ -294,13 +160,161 
@@ providers: - qwen/qwen-max - qwen/qwen-plus - qwen/qwen-turbo - z-ai: - - z-ai/glm-4.5 - - z-ai/glm-4.5-air - - z-ai/glm-4.6 - - z-ai/glm-4.7 - - z-ai/glm-5 + mistralai: + - mistralai/mistral-medium-2505 + - mistralai/mistral-medium-2508 + - mistralai/mistral-medium-latest + - mistralai/mistral-medium + - mistralai/mistral-vibe-cli-with-tools + - mistralai/open-mistral-nemo + - mistralai/open-mistral-nemo-2407 + - mistralai/mistral-tiny-2407 + - mistralai/mistral-tiny-latest + - mistralai/codestral-2508 + - mistralai/codestral-latest + - mistralai/devstral-2512 + - mistralai/mistral-vibe-cli-latest + - mistralai/devstral-medium-latest + - mistralai/devstral-latest + - mistralai/mistral-small-2506 + - mistralai/mistral-small-latest + - mistralai/labs-mistral-small-creative + - mistralai/magistral-medium-2509 + - mistralai/magistral-medium-latest + - mistralai/magistral-small-2509 + - mistralai/magistral-small-latest + - mistralai/mistral-large-2512 + - mistralai/mistral-large-latest + - mistralai/ministral-3b-2512 + - mistralai/ministral-3b-latest + - mistralai/ministral-8b-2512 + - mistralai/ministral-8b-latest + - mistralai/ministral-14b-2512 + - mistralai/ministral-14b-latest + - mistralai/mistral-large-2411 + - mistralai/pixtral-large-2411 + - mistralai/pixtral-large-latest + - mistralai/mistral-large-pixtral-2411 + - mistralai/devstral-small-2507 + - mistralai/devstral-medium-2507 + - mistralai/labs-devstral-small-2512 + - mistralai/devstral-small-latest + - mistralai/mistral-squarepoint-2602 + - mistralai/mistral-embed-2312 + - mistralai/mistral-embed + - mistralai/codestral-embed + - mistralai/codestral-embed-2505 + moonshotai: + - moonshotai/kimi-k2.5 + - moonshotai/kimi-k2-0905-preview + - moonshotai/moonshot-v1-32k + - moonshotai/moonshot-v1-128k + - moonshotai/kimi-k2-thinking-turbo + - moonshotai/moonshot-v1-8k-vision-preview + - moonshotai/kimi-k2-0711-preview + - moonshotai/moonshot-v1-auto + - moonshotai/kimi-k2-thinking + - 
moonshotai/moonshot-v1-128k-vision-preview + - moonshotai/kimi-k2-turbo-preview + - moonshotai/moonshot-v1-32k-vision-preview + - moonshotai/moonshot-v1-8k + anthropic: + - anthropic/claude-sonnet-4-6 + - anthropic/claude-opus-4-6 + - anthropic/claude-opus-4-5-20251101 + - anthropic/claude-opus-4-5 + - anthropic/claude-haiku-4-5-20251001 + - anthropic/claude-haiku-4-5 + - anthropic/claude-sonnet-4-5-20250929 + - anthropic/claude-sonnet-4-5 + - anthropic/claude-opus-4-1-20250805 + - anthropic/claude-opus-4-1 + - anthropic/claude-opus-4-20250514 + - anthropic/claude-opus-4 + - anthropic/claude-sonnet-4-20250514 + - anthropic/claude-sonnet-4 + - anthropic/claude-3-haiku-20240307 + - anthropic/claude-3-haiku + openai: + - openai/gpt-4-0613 + - openai/gpt-4 + - openai/gpt-3.5-turbo + - openai/gpt-5.4 + - openai/gpt-5.3-chat-latest + - openai/gpt-5.4-2026-03-05 + - openai/gpt-5.4-pro + - openai/gpt-5.4-pro-2026-03-05 + - openai/gpt-3.5-turbo-instruct + - openai/gpt-3.5-turbo-instruct-0914 + - openai/gpt-4-1106-preview + - openai/gpt-3.5-turbo-1106 + - openai/gpt-4-0125-preview + - openai/gpt-4-turbo-preview + - openai/gpt-3.5-turbo-0125 + - openai/gpt-4-turbo + - openai/gpt-4-turbo-2024-04-09 + - openai/gpt-4o + - openai/gpt-4o-2024-05-13 + - openai/gpt-4o-mini-2024-07-18 + - openai/gpt-4o-mini + - openai/gpt-4o-2024-08-06 + - openai/o1-2024-12-17 + - openai/o1 + - openai/computer-use-preview + - openai/o3-mini + - openai/o3-mini-2025-01-31 + - openai/gpt-4o-2024-11-20 + - openai/computer-use-preview-2025-03-11 + - openai/gpt-4o-mini-search-preview-2025-03-11 + - openai/gpt-4o-mini-search-preview + - openai/o1-pro-2025-03-19 + - openai/o1-pro + - openai/o3-2025-04-16 + - openai/o4-mini-2025-04-16 + - openai/o3 + - openai/o4-mini + - openai/gpt-4.1-2025-04-14 + - openai/gpt-4.1 + - openai/gpt-4.1-mini-2025-04-14 + - openai/gpt-4.1-mini + - openai/gpt-4.1-nano-2025-04-14 + - openai/gpt-4.1-nano + - openai/o3-pro + - openai/o3-pro-2025-06-10 + - openai/o4-mini-deep-research 
+ - openai/o3-deep-research + - openai/o3-deep-research-2025-06-26 + - openai/o4-mini-deep-research-2025-06-26 + - openai/gpt-5-chat-latest + - openai/gpt-5-2025-08-07 + - openai/gpt-5 + - openai/gpt-5-mini-2025-08-07 + - openai/gpt-5-mini + - openai/gpt-5-nano-2025-08-07 + - openai/gpt-5-nano + - openai/gpt-5-codex + - openai/gpt-5-pro-2025-10-06 + - openai/gpt-5-pro + - openai/gpt-5-search-api + - openai/gpt-5-search-api-2025-10-14 + - openai/gpt-5.1-chat-latest + - openai/gpt-5.1-2025-11-13 + - openai/gpt-5.1 + - openai/gpt-5.1-codex + - openai/gpt-5.1-codex-mini + - openai/gpt-5.1-codex-max + - openai/gpt-5.2-2025-12-11 + - openai/gpt-5.2 + - openai/gpt-5.2-pro-2025-12-11 + - openai/gpt-5.2-pro + - openai/gpt-5.2-chat-latest + - openai/gpt-5.2-codex + - openai/gpt-5.3-codex + - openai/gpt-4o-search-preview + - openai/gpt-4o-search-preview-2025-03-11 + - openai/gpt-3.5-turbo-16k + - openai/ft:gpt-3.5-turbo-0613:katanemo::8CMZbm0P metadata: total_providers: 10 - total_models: 289 - last_updated: 2026-02-13T22:44:30.413065+00:00 + total_models: 303 + last_updated: 2026-03-15T16:47:22.207197+00:00 diff --git a/crates/hermesllm/src/lib.rs b/crates/hermesllm/src/lib.rs index 997fc72a..3b9611e0 100644 --- a/crates/hermesllm/src/lib.rs +++ b/crates/hermesllm/src/lib.rs @@ -35,7 +35,7 @@ mod tests { ProviderId::Mistral ); assert_eq!(ProviderId::try_from("groq").unwrap(), ProviderId::Groq); - assert_eq!(ProviderId::try_from("arch").unwrap(), ProviderId::Arch); + assert_eq!(ProviderId::try_from("plano").unwrap(), ProviderId::Plano); // Test aliases assert_eq!(ProviderId::try_from("google").unwrap(), ProviderId::Gemini); diff --git a/crates/hermesllm/src/providers/id.rs b/crates/hermesllm/src/providers/id.rs index 11008711..9f5f42c9 100644 --- a/crates/hermesllm/src/providers/id.rs +++ b/crates/hermesllm/src/providers/id.rs @@ -34,7 +34,7 @@ pub enum ProviderId { Gemini, Anthropic, GitHub, - Arch, + Plano, AzureOpenAI, XAI, TogetherAI, @@ -58,7 +58,7 @@ impl TryFrom<&str> 
for ProviderId { "google" => Ok(ProviderId::Gemini), // alias "anthropic" => Ok(ProviderId::Anthropic), "github" => Ok(ProviderId::GitHub), - "arch" => Ok(ProviderId::Arch), + "plano" => Ok(ProviderId::Plano), "azure_openai" => Ok(ProviderId::AzureOpenAI), "xai" => Ok(ProviderId::XAI), "together_ai" => Ok(ProviderId::TogetherAI), @@ -135,7 +135,7 @@ impl ProviderId { | ProviderId::Groq | ProviderId::Mistral | ProviderId::Deepseek - | ProviderId::Arch + | ProviderId::Plano | ProviderId::Gemini | ProviderId::GitHub | ProviderId::AzureOpenAI @@ -153,7 +153,7 @@ impl ProviderId { | ProviderId::Groq | ProviderId::Mistral | ProviderId::Deepseek - | ProviderId::Arch + | ProviderId::Plano | ProviderId::Gemini | ProviderId::GitHub | ProviderId::AzureOpenAI @@ -219,7 +219,7 @@ impl Display for ProviderId { ProviderId::Gemini => write!(f, "Gemini"), ProviderId::Anthropic => write!(f, "Anthropic"), ProviderId::GitHub => write!(f, "GitHub"), - ProviderId::Arch => write!(f, "Arch"), + ProviderId::Plano => write!(f, "Plano"), ProviderId::AzureOpenAI => write!(f, "azure_openai"), ProviderId::XAI => write!(f, "xai"), ProviderId::TogetherAI => write!(f, "together_ai"), diff --git a/crates/llm_gateway/src/stream_context.rs b/crates/llm_gateway/src/stream_context.rs index 7a353bcb..f62631fa 100644 --- a/crates/llm_gateway/src/stream_context.rs +++ b/crates/llm_gateway/src/stream_context.rs @@ -873,7 +873,7 @@ impl HttpContext for StreamContext { // ensure that the provider has an endpoint if the access key is missing else return a bad request if self.llm_provider.as_ref().unwrap().endpoint.is_none() && self.llm_provider.as_ref().unwrap().provider_interface - != LlmProviderType::Arch + != LlmProviderType::Plano { self.send_server_error(error, Some(StatusCode::BAD_REQUEST)); } diff --git a/demos/agent_orchestration/travel_agents/README.md b/demos/agent_orchestration/travel_agents/README.md index 7886539d..239ba938 100644 --- a/demos/agent_orchestration/travel_agents/README.md +++ 
b/demos/agent_orchestration/travel_agents/README.md @@ -123,6 +123,42 @@ Each agent: Both agents run as native local processes and communicate with Plano running natively on the host. +## Running with local Plano-Orchestrator (via vLLM) + +By default, Plano uses a hosted Plano-Orchestrator endpoint. To self-host the orchestrator model locally using vLLM on a server with an NVIDIA GPU: + +1. Install vLLM and download the model: +```bash +pip install vllm +``` + +2. Start the vLLM server with the 4B model: +```bash +vllm serve katanemo/Plano-Orchestrator-4B \ + --host 0.0.0.0 \ + --port 8000 \ + --tensor-parallel-size 1 \ + --gpu-memory-utilization 0.3 \ + --tokenizer katanemo/Plano-Orchestrator-4B \ + --chat-template chat_template.jinja \ + --served-model-name katanemo/Plano-Orchestrator-4B \ + --enable-prefix-caching +``` + +3. Start the demo with the local orchestrator config: +```bash +./run_demo.sh --local-orchestrator +``` + +4. Test with curl: +```bash +curl -X POST http://localhost:8001/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"model": "gpt-5.2", "messages": [{"role": "user", "content": "What is the weather in Istanbul?"}]}' +``` + +You should see Plano use your local orchestrator to route the request to the weather agent. 
+ ## Observability This demo includes full OpenTelemetry (OTel) compatible distributed tracing to monitor and debug agent interactions: diff --git a/demos/agent_orchestration/travel_agents/config_local_orchestrator.yaml b/demos/agent_orchestration/travel_agents/config_local_orchestrator.yaml new file mode 100644 index 00000000..1d3a0be8 --- /dev/null +++ b/demos/agent_orchestration/travel_agents/config_local_orchestrator.yaml @@ -0,0 +1,66 @@ +version: v0.3.0 + +overrides: + agent_orchestration_model: plano/katanemo/Plano-Orchestrator-4B + +agents: + - id: weather_agent + url: http://localhost:10510 + - id: flight_agent + url: http://localhost:10520 + +model_providers: + - model: plano/katanemo/Plano-Orchestrator-4B + base_url: http://localhost:8000 + + - model: openai/gpt-5.2 + access_key: $OPENAI_API_KEY + default: true + - model: openai/gpt-4o-mini + access_key: $OPENAI_API_KEY # smaller, faster, cheaper model for extracting entities like location + +listeners: + - type: agent + name: travel_booking_service + port: 8001 + router: plano_orchestrator_v1 + agents: + - id: weather_agent + description: | + + WeatherAgent is a specialized AI assistant for real-time weather information and forecasts. It provides accurate weather data for any city worldwide using the Open-Meteo API, helping travelers plan their trips with up-to-date weather conditions. + + Capabilities: + * Get real-time weather conditions and multi-day forecasts for any city worldwide using Open-Meteo API (free, no API key needed) + * Provides current temperature + * Provides multi-day forecasts + * Provides weather conditions + * Provides sunrise/sunset times + * Provides detailed weather information + * Understands conversation context to resolve location references from previous messages + * Handles weather-related questions including "What's the weather in [city]?", "What's the forecast for [city]?", "How's the weather in [city]?" 
+ * When queries include both weather and other travel questions (e.g., flights, currency), this agent answers ONLY the weather part + + - id: flight_agent + description: | + + FlightAgent is an AI-powered tool specialized in providing live flight information between airports. It leverages the FlightAware AeroAPI to deliver real-time flight status, gate information, and delay updates. + + Capabilities: + * Get live flight information between airports using FlightAware AeroAPI + * Shows real-time flight status + * Shows scheduled/estimated/actual departure and arrival times + * Shows gate and terminal information + * Shows delays + * Shows aircraft type + * Shows flight status + * Automatically resolves city names to airport codes (IATA/ICAO) + * Understands conversation context to infer origin/destination from follow-up questions + * Handles flight-related questions including "What flights go from [city] to [city]?", "Do flights go to [city]?", "Are there direct flights from [city]?" + * When queries include both flight and other travel questions (e.g., weather, currency), this agent answers ONLY the flight part + +tracing: + random_sampling: 100 + span_attributes: + header_prefixes: + - x-acme- diff --git a/demos/agent_orchestration/travel_agents/run_demo.sh b/demos/agent_orchestration/travel_agents/run_demo.sh index 643a0aa2..35166b85 100755 --- a/demos/agent_orchestration/travel_agents/run_demo.sh +++ b/demos/agent_orchestration/travel_agents/run_demo.sh @@ -31,8 +31,13 @@ start_demo() { fi # Step 4: Start Plano - echo "Starting Plano with config.yaml..." - planoai up config.yaml + PLANO_CONFIG="config.yaml" + if [ "$1" == "--local-orchestrator" ]; then + PLANO_CONFIG="config_local_orchestrator.yaml" + echo "Using local orchestrator config..." + fi + echo "Starting Plano with $PLANO_CONFIG..." + planoai up "$PLANO_CONFIG" # Step 5: Start agents natively echo "Starting agents..." 
diff --git a/demos/llm_routing/model_routing_service/README.md b/demos/llm_routing/model_routing_service/README.md index 85d56abf..72b672f3 100644 --- a/demos/llm_routing/model_routing_service/README.md +++ b/demos/llm_routing/model_routing_service/README.md @@ -1,6 +1,54 @@ # Model Routing Service Demo -This demo shows how to use the `/routing/v1/*` endpoints to get routing decisions without proxying requests to an LLM. The endpoint accepts standard LLM request formats and returns which model Plano's router would select. +Plano is an AI-native proxy and data plane for agentic apps — with built-in orchestration, safety, observability, and intelligent LLM routing. + +``` +┌───────────┐ ┌─────────────────────────────────┐ ┌──────────────┐ +│ Client │ ───► │ Plano │ ───► │ OpenAI │ +│ (any │ │ │ │ Anthropic │ +│ language)│ │ Arch-Router (1.5B model) │ │ Any Provider│ +└───────────┘ │ analyzes intent → picks model │ └──────────────┘ + └─────────────────────────────────┘ +``` + +- **One endpoint, many models** — apps call Plano using standard OpenAI/Anthropic APIs; Plano handles provider selection, keys, and failover +- **Intelligent routing** — a lightweight 1.5B router model classifies user intent and picks the best model per request +- **Platform governance** — centralize API keys, rate limits, guardrails, and observability without touching app code +- **Runs anywhere** — single binary; self-host the router for full data privacy + +## How Routing Works + +The entire routing configuration is plain YAML — no code: + +```yaml +model_providers: + - model: openai/gpt-4o-mini + default: true # fallback for unmatched requests + + - model: openai/gpt-4o + routing_preferences: + - name: complex_reasoning + description: complex reasoning tasks, multi-step analysis + + - model: anthropic/claude-sonnet-4-20250514 + routing_preferences: + - name: code_generation + description: generating new code, writing functions +``` + +When a request arrives, Plano sends the conversation and 
routing preferences to Arch-Router, which classifies the intent and returns the matching route: + +``` +1. Request arrives → "Write binary search in Python" +2. Preferences serialized → [{"name":"code_generation", ...}, {"name":"complex_reasoning", ...}] +3. Arch-Router classifies → {"route": "code_generation"} +4. Route → Model lookup → code_generation → anthropic/claude-sonnet-4-20250514 +5. Request forwarded → Claude generates the response +``` + +No match? Arch-Router returns `other` → Plano falls back to the default model. + +The `/routing/v1/*` endpoints return the routing decision **without** forwarding to the LLM — useful for testing and validating routing behavior before going to production. ## Setup @@ -55,6 +103,69 @@ Response: The response tells you which model would handle this request and which route was matched, without actually making the LLM call. +## Kubernetes Deployment (Self-hosted Arch-Router on GPU) + +To run Arch-Router in-cluster using vLLM instead of the default hosted endpoint: + +**1. Check your GPU node labels and taints** + +```bash +kubectl get nodes --show-labels | grep -i gpu +kubectl get node <node-name> -o jsonpath='{.spec.taints}' +``` + +GPU nodes commonly have a `nvidia.com/gpu:NoSchedule` taint — `vllm-deployment.yaml` includes a matching toleration. If you have multiple GPU node pools and need to pin to a specific one, uncomment and set the `nodeSelector` in `vllm-deployment.yaml` using the label for your cloud provider. + +**2. Deploy Arch-Router and Plano:** + +```bash + +# arch-router deployment +kubectl apply -f vllm-deployment.yaml + +# plano deployment +kubectl create secret generic plano-secrets \ + --from-literal=OPENAI_API_KEY=$OPENAI_API_KEY \ + --from-literal=ANTHROPIC_API_KEY=$ANTHROPIC_API_KEY + +kubectl create configmap plano-config \ + --from-file=plano_config.yaml=config_k8s.yaml \ + --dry-run=client -o yaml | kubectl apply -f - + +kubectl apply -f plano-deployment.yaml +``` + +**3. 
Wait for both pods to be ready:** + +```bash +# Arch-Router downloads the model (~1 min) then vLLM loads it (~2 min) +kubectl get pods -l app=arch-router -w +kubectl rollout status deployment/plano +``` + +**4. Test:** + +```bash +kubectl port-forward svc/plano 12000:12000 +./demo.sh +``` + +To confirm requests are hitting your in-cluster Arch-Router (not just health checks): + +```bash +kubectl logs -l app=arch-router -f --tail=0 +# Look for POST /v1/chat/completions entries +``` + +**Updating the config:** + +```bash +kubectl create configmap plano-config \ + --from-file=plano_config.yaml=config_k8s.yaml \ + --dry-run=client -o yaml | kubectl apply -f - +kubectl rollout restart deployment/plano +``` + ## Demo Output ``` diff --git a/demos/llm_routing/model_routing_service/config_k8s.yaml b/demos/llm_routing/model_routing_service/config_k8s.yaml new file mode 100644 index 00000000..bdf98bfa --- /dev/null +++ b/demos/llm_routing/model_routing_service/config_k8s.yaml @@ -0,0 +1,33 @@ +version: v0.3.0 + +overrides: + llm_routing_model: plano/Arch-Router + +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + + - model: plano/Arch-Router + base_url: http://arch-router:10000 + + - model: openai/gpt-4o-mini + access_key: $OPENAI_API_KEY + default: true + + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + routing_preferences: + - name: complex_reasoning + description: complex reasoning tasks, multi-step analysis, or detailed explanations + + - model: anthropic/claude-sonnet-4-20250514 + access_key: $ANTHROPIC_API_KEY + routing_preferences: + - name: code_generation + description: generating new code, writing functions, or creating boilerplate + +tracing: + random_sampling: 100 diff --git a/demos/llm_routing/model_routing_service/plano-deployment.yaml b/demos/llm_routing/model_routing_service/plano-deployment.yaml new file mode 100644 index 00000000..e093f404 --- /dev/null +++ 
b/demos/llm_routing/model_routing_service/plano-deployment.yaml @@ -0,0 +1,68 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: plano + labels: + app: plano +spec: + replicas: 1 + selector: + matchLabels: + app: plano + template: + metadata: + labels: + app: plano + spec: + containers: + - name: plano + image: katanemo/plano:0.4.12 + ports: + - containerPort: 12000 # LLM gateway (chat completions, model routing) + name: llm-gateway + envFrom: + - secretRef: + name: plano-secrets + env: + - name: LOG_LEVEL + value: "info" + volumeMounts: + - name: plano-config + mountPath: /app/plano_config.yaml + subPath: plano_config.yaml + readOnly: true + readinessProbe: + httpGet: + path: /healthz + port: 12000 + initialDelaySeconds: 5 + periodSeconds: 10 + livenessProbe: + httpGet: + path: /healthz + port: 12000 + initialDelaySeconds: 10 + periodSeconds: 30 + resources: + requests: + memory: "256Mi" + cpu: "250m" + limits: + memory: "512Mi" + cpu: "1000m" + volumes: + - name: plano-config + configMap: + name: plano-config +--- +apiVersion: v1 +kind: Service +metadata: + name: plano +spec: + selector: + app: plano + ports: + - name: llm-gateway + port: 12000 + targetPort: 12000 diff --git a/demos/llm_routing/model_routing_service/test.rest b/demos/llm_routing/model_routing_service/test.rest new file mode 100644 index 00000000..b41d75f2 --- /dev/null +++ b/demos/llm_routing/model_routing_service/test.rest @@ -0,0 +1,36 @@ +### Code generation query (OpenAI format) — expects anthropic/claude-sonnet +POST http://localhost:12000/routing/v1/chat/completions +Content-Type: application/json + +{ + "model": "gpt-4o-mini", + "messages": [{"role": "user", "content": "Write a Python function for binary search"}] +} + +### Complex reasoning query (OpenAI format) — expects openai/gpt-4o +POST http://localhost:12000/routing/v1/chat/completions +Content-Type: application/json + +{ + "model": "gpt-4o-mini", + "messages": [{"role": "user", "content": "Analyze the trade-offs between 
microservices and monolithic architecture"}] +} + +### Simple query — no routing match, expects default model +POST http://localhost:12000/routing/v1/chat/completions +Content-Type: application/json + +{ + "model": "gpt-4o-mini", + "messages": [{"role": "user", "content": "Hello"}] +} + +### Code generation query (Anthropic format) +POST http://localhost:12000/routing/v1/messages +Content-Type: application/json + +{ + "model": "claude-sonnet-4-20250514", + "max_tokens": 1024, + "messages": [{"role": "user", "content": "Write a REST API in Go using Gin"}] +} diff --git a/demos/llm_routing/model_routing_service/vllm-deployment.yaml b/demos/llm_routing/model_routing_service/vllm-deployment.yaml new file mode 100644 index 00000000..1debe15e --- /dev/null +++ b/demos/llm_routing/model_routing_service/vllm-deployment.yaml @@ -0,0 +1,104 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: arch-router + labels: + app: arch-router +spec: + replicas: 1 + selector: + matchLabels: + app: arch-router + template: + metadata: + labels: + app: arch-router + spec: + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + # Optional: add a nodeSelector to pin to a specific GPU node pool. + # The nvidia.com/gpu resource request below is sufficient for most clusters. 
+ # nodeSelector: + # DigitalOcean: doks.digitalocean.com/gpu-model: l40s + # GKE: cloud.google.com/gke-accelerator: nvidia-l4 + # EKS: eks.amazonaws.com/nodegroup: gpu-nodes + # AKS: kubernetes.azure.com/agentpool: gpupool + initContainers: + - name: download-model + image: python:3.11-slim + command: + - sh + - -c + - | + pip install huggingface_hub[cli] && \ + python -c "from huggingface_hub import snapshot_download; snapshot_download('katanemo/Arch-Router-1.5B.gguf', local_dir='/models/Arch-Router-1.5B.gguf')" + volumeMounts: + - name: model-cache + mountPath: /models + containers: + - name: vllm + image: vllm/vllm-openai:latest + command: + - vllm + - serve + - /models/Arch-Router-1.5B.gguf/Arch-Router-1.5B-Q4_K_M.gguf + - "--host" + - "0.0.0.0" + - "--port" + - "10000" + - "--load-format" + - "gguf" + - "--tokenizer" + - "katanemo/Arch-Router-1.5B" + - "--served-model-name" + - "Arch-Router" + - "--gpu-memory-utilization" + - "0.3" + - "--tensor-parallel-size" + - "1" + - "--enable-prefix-caching" + ports: + - name: http + containerPort: 10000 + protocol: TCP + resources: + requests: + cpu: "1" + memory: "4Gi" + nvidia.com/gpu: "1" + limits: + cpu: "4" + memory: "8Gi" + nvidia.com/gpu: "1" + volumeMounts: + - name: model-cache + mountPath: /models + readinessProbe: + httpGet: + path: /health + port: 10000 + initialDelaySeconds: 60 + periodSeconds: 10 + livenessProbe: + httpGet: + path: /health + port: 10000 + initialDelaySeconds: 180 + periodSeconds: 30 + volumes: + - name: model-cache + emptyDir: {} +--- +apiVersion: v1 +kind: Service +metadata: + name: arch-router +spec: + selector: + app: arch-router + ports: + - name: http + port: 10000 + targetPort: 10000 diff --git a/demos/llm_routing/openclaw_routing/config.yaml b/demos/llm_routing/openclaw_routing/config.yaml index 3106b5dd..9690e747 100644 --- a/demos/llm_routing/openclaw_routing/config.yaml +++ b/demos/llm_routing/openclaw_routing/config.yaml @@ -1,8 +1,7 @@ version: v0.1.0 -routing: - model: 
Arch-Router - llm_provider: arch-router +overrides: + llm_routing_model: Arch-Router listeners: egress_traffic: diff --git a/demos/llm_routing/preference_based_routing/plano_config_local.yaml b/demos/llm_routing/preference_based_routing/plano_config_local.yaml index dbd287dd..01adb097 100644 --- a/demos/llm_routing/preference_based_routing/plano_config_local.yaml +++ b/demos/llm_routing/preference_based_routing/plano_config_local.yaml @@ -1,8 +1,7 @@ version: v0.3.0 -routing: - model: Arch-Router - llm_provider: arch-router +overrides: + llm_routing_model: plano/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M listeners: - type: model @@ -11,8 +10,7 @@ listeners: model_providers: - - name: arch-router - model: arch/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M + - model: plano/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M base_url: http://localhost:11434 - model: openai/gpt-4o-mini diff --git a/docs/source/conf.py b/docs/source/conf.py index ec476136..e554329f 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -17,7 +17,7 @@ from sphinxawesome_theme.postprocess import Icons project = "Plano Docs" copyright = "2025, Katanemo Labs, Inc" author = "Katanemo Labs, Inc" -release = " v0.4.11" +release = " v0.4.12" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration diff --git a/docs/source/get_started/quickstart.rst b/docs/source/get_started/quickstart.rst index 279fde2d..9d51d1c4 100644 --- a/docs/source/get_started/quickstart.rst +++ b/docs/source/get_started/quickstart.rst @@ -43,7 +43,7 @@ Plano's CLI allows you to manage and interact with the Plano efficiently. To ins .. code-block:: console - $ uv tool install planoai==0.4.11 + $ uv tool install planoai==0.4.12 **Option 2: Install with pip (Traditional)** @@ -51,7 +51,7 @@ Plano's CLI allows you to manage and interact with the Plano efficiently. 
To ins $ python -m venv venv $ source venv/bin/activate # On Windows, use: venv\Scripts\activate - $ pip install planoai==0.4.11 + $ pip install planoai==0.4.12 .. _llm_routing_quickstart: diff --git a/docs/source/guides/llm_router.rst b/docs/source/guides/llm_router.rst index 41c51b4a..7c4ad685 100644 --- a/docs/source/guides/llm_router.rst +++ b/docs/source/guides/llm_router.rst @@ -253,13 +253,11 @@ Using Ollama (recommended for local development) .. code-block:: yaml - routing: - model: Arch-Router - llm_provider: arch-router + overrides: + llm_routing_model: plano/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M model_providers: - - name: arch-router - model: arch/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M + - model: plano/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M base_url: http://localhost:11434 - model: openai/gpt-5.2 @@ -324,13 +322,11 @@ vLLM provides higher throughput and GPU optimizations suitable for production de .. code-block:: yaml - routing: - model: Arch-Router - llm_provider: arch-router + overrides: + llm_routing_model: plano/Arch-Router model_providers: - - name: arch-router - model: Arch-Router + - model: plano/Arch-Router base_url: http://<your-server-ip>:10000 - model: openai/gpt-5.2 @@ -351,6 +347,35 @@ vLLM provides higher throughput and GPU optimizations suitable for production de curl http://localhost:10000/v1/models +Using vLLM on Kubernetes (GPU nodes) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For teams running Kubernetes, Arch-Router and Plano can be deployed as in-cluster services. 
+The ``demos/llm_routing/model_routing_service/`` directory includes ready-to-use manifests: + +- ``vllm-deployment.yaml`` — Arch-Router served by vLLM, with an init container to download + the model from HuggingFace +- ``plano-deployment.yaml`` — Plano proxy configured to use the in-cluster Arch-Router +- ``config_k8s.yaml`` — Plano config with ``llm_routing_model`` pointing at + ``http://arch-router:10000`` instead of the default hosted endpoint + +Key things to know before deploying: + +- GPU nodes commonly have a ``nvidia.com/gpu:NoSchedule`` taint — the ``vllm-deployment.yaml`` + includes a matching toleration. The ``nvidia.com/gpu: "1"`` resource request is sufficient + for scheduling in most clusters; a ``nodeSelector`` is optional and commented out in the + manifest for cases where you need to pin to a specific GPU node pool. +- Model download takes ~1 minute; vLLM loads the model in ~1-2 minutes after that. The + ``livenessProbe`` has a 180-second ``initialDelaySeconds`` to avoid premature restarts. +- The Plano config ConfigMap must use ``--from-file=plano_config.yaml=config_k8s.yaml`` with + ``subPath`` in the Deployment — omitting ``subPath`` causes Kubernetes to mount a directory + instead of a file. + +For the canonical Plano Kubernetes deployment (ConfigMap, Secrets, Deployment YAML), see +:ref:`deployment`. For full step-by-step commands specific to this demo, see the +`demo README `_. 
+ + Combining Routing Methods ------------------------- diff --git a/docs/source/guides/orchestration.rst b/docs/source/guides/orchestration.rst index 3170b65f..1a153e83 100644 --- a/docs/source/guides/orchestration.rst +++ b/docs/source/guides/orchestration.rst @@ -335,6 +335,90 @@ Combine RAG agents for documentation lookup with specialized troubleshooting age - id: troubleshoot_agent description: Diagnoses and resolves technical issues step by step +Self-hosting Plano-Orchestrator +------------------------------- + +By default, Plano uses a hosted Plano-Orchestrator endpoint. To self-host the orchestrator model, you can serve it using **vLLM** on a server with an NVIDIA GPU. + +.. note:: + vLLM requires a Linux server with an NVIDIA GPU (CUDA). For local development on macOS, a GGUF version for Ollama is coming soon. + +The following model variants are available on HuggingFace: + +* `Plano-Orchestrator-4B <https://huggingface.co/katanemo/Plano-Orchestrator-4B>`_ — lighter model, suitable for development and testing +* `Plano-Orchestrator-4B-FP8 <https://huggingface.co/katanemo/Plano-Orchestrator-4B-FP8>`_ — FP8 quantized 4B model, lower memory usage +* `Plano-Orchestrator-30B-A3B <https://huggingface.co/katanemo/Plano-Orchestrator-30B-A3B>`_ — full-size model for production +* `Plano-Orchestrator-30B-A3B-FP8 <https://huggingface.co/katanemo/Plano-Orchestrator-30B-A3B-FP8>`_ — FP8 quantized 30B model, recommended for production deployments + +Using vLLM +~~~~~~~~~~ + +1. **Install vLLM** + + .. code-block:: bash + + pip install vllm + +2. **Download the model and chat template** + + .. code-block:: bash + + pip install huggingface_hub + huggingface-cli download katanemo/Plano-Orchestrator-4B + +3. **Start the vLLM server** + + For the 4B model (development): + + .. code-block:: bash + + vllm serve katanemo/Plano-Orchestrator-4B \ + --host 0.0.0.0 \ + --port 8000 \ + --tensor-parallel-size 1 \ + --gpu-memory-utilization 0.3 \ + --tokenizer katanemo/Plano-Orchestrator-4B \ + --chat-template chat_template.jinja \ + --served-model-name katanemo/Plano-Orchestrator-4B \ + --enable-prefix-caching + + For the 30B-A3B-FP8 model (production): + + .. 
code-block:: bash + + vllm serve katanemo/Plano-Orchestrator-30B-A3B-FP8 \ + --host 0.0.0.0 \ + --port 8000 \ + --tensor-parallel-size 1 \ + --gpu-memory-utilization 0.9 \ + --tokenizer katanemo/Plano-Orchestrator-30B-A3B-FP8 \ + --chat-template chat_template.jinja \ + --max-model-len 32768 \ + --served-model-name katanemo/Plano-Orchestrator-30B-A3B-FP8 \ + --enable-prefix-caching + +4. **Configure Plano to use the local orchestrator** + + Use the model name matching your ``--served-model-name``: + + .. code-block:: yaml + + overrides: + agent_orchestration_model: plano/katanemo/Plano-Orchestrator-4B + + model_providers: + - model: katanemo/Plano-Orchestrator-4B + provider_interface: plano + base_url: http://<your-server-ip>:8000 + +5. **Verify the server is running** + + .. code-block:: bash + + curl http://localhost:8000/health + curl http://localhost:8000/v1/models + + Next Steps ---------- diff --git a/docs/source/resources/deployment.rst b/docs/source/resources/deployment.rst index 7b8b0554..2689384e 100644 --- a/docs/source/resources/deployment.rst +++ b/docs/source/resources/deployment.rst @@ -65,7 +65,7 @@ Create a ``docker-compose.yml`` file with the following configuration: # docker-compose.yml services: plano: - image: katanemo/plano:0.4.11 + image: katanemo/plano:0.4.12 container_name: plano ports: - "10000:10000" # ingress (client -> plano) @@ -153,7 +153,7 @@ Create a ``plano-deployment.yaml``: spec: containers: - name: plano - image: katanemo/plano:0.4.11 + image: katanemo/plano:0.4.12 ports: - containerPort: 12000 # LLM gateway (chat completions, model routing) name: llm-gateway diff --git a/docs/source/resources/includes/plano_config_full_reference_rendered.yaml b/docs/source/resources/includes/plano_config_full_reference_rendered.yaml index 9717b53a..64ee1f91 100644 --- a/docs/source/resources/includes/plano_config_full_reference_rendered.yaml +++ b/docs/source/resources/includes/plano_config_full_reference_rendered.yaml @@ -107,11 +107,11 @@ model_providers: - 
internal: true model: Arch-Function name: arch-function - provider_interface: arch + provider_interface: plano - internal: true model: Plano-Orchestrator - name: plano-orchestrator - provider_interface: arch + name: plano/orchestrator + provider_interface: plano prompt_targets: - description: Get current weather at a location. endpoint: