resolve merge conflict in main.rs

This commit is contained in:
Adil Hafeez 2026-03-16 12:40:33 -07:00
commit 80dfb41cad
No known key found for this signature in database
GPG key ID: 9B18EF7691369645
40 changed files with 920 additions and 301 deletions

View file

@ -25,4 +25,6 @@ Update the version string in ALL of these files:
Do NOT change version strings in `*.lock` files or `Cargo.lock`. Do NOT change version strings in `*.lock` files or `Cargo.lock`.
After updating all version strings, run `cd cli && uv lock` to update the lock file with the new version.
After making changes, show a summary of all files modified and the old → new version. After making changes, show a summary of all files modified and the old → new version.

View file

@ -133,13 +133,13 @@ jobs:
load: true load: true
tags: | tags: |
${{ env.PLANO_DOCKER_IMAGE }} ${{ env.PLANO_DOCKER_IMAGE }}
${{ env.DOCKER_IMAGE }}:0.4.11 ${{ env.DOCKER_IMAGE }}:0.4.12
${{ env.DOCKER_IMAGE }}:latest ${{ env.DOCKER_IMAGE }}:latest
cache-from: type=gha cache-from: type=gha
cache-to: type=gha,mode=max cache-to: type=gha,mode=max
- name: Save image as artifact - name: Save image as artifact
run: docker save ${{ env.PLANO_DOCKER_IMAGE }} ${{ env.DOCKER_IMAGE }}:0.4.11 ${{ env.DOCKER_IMAGE }}:latest -o /tmp/plano-image.tar run: docker save ${{ env.PLANO_DOCKER_IMAGE }} ${{ env.DOCKER_IMAGE }}:0.4.12 ${{ env.DOCKER_IMAGE }}:latest -o /tmp/plano-image.tar
- name: Upload image artifact - name: Upload image artifact
uses: actions/upload-artifact@v6 uses: actions/upload-artifact@v6

1
.gitignore vendored
View file

@ -152,3 +152,4 @@ apps/*/dist/
.cursor/ .cursor/
.agents .agents
docs/do/

View file

@ -4,6 +4,7 @@ repos:
hooks: hooks:
- id: check-yaml - id: check-yaml
exclude: config/envoy.template* exclude: config/envoy.template*
args: [--allow-multiple-documents]
- id: end-of-file-fixer - id: end-of-file-fixer
- id: trailing-whitespace - id: trailing-whitespace
- repo: local - repo: local

View file

@ -24,7 +24,7 @@ export function Hero() {
> >
<div className="inline-flex flex-wrap items-center gap-1.5 sm:gap-2 px-3 sm:px-4 py-1 rounded-full bg-[rgba(185,191,255,0.4)] border border-[var(--secondary)] shadow backdrop-blur hover:bg-[rgba(185,191,255,0.6)] transition-colors cursor-pointer"> <div className="inline-flex flex-wrap items-center gap-1.5 sm:gap-2 px-3 sm:px-4 py-1 rounded-full bg-[rgba(185,191,255,0.4)] border border-[var(--secondary)] shadow backdrop-blur hover:bg-[rgba(185,191,255,0.6)] transition-colors cursor-pointer">
<span className="text-xs sm:text-sm font-medium text-black/65"> <span className="text-xs sm:text-sm font-medium text-black/65">
v0.4.11 v0.4.12
</span> </span>
<span className="text-xs sm:text-sm font-medium text-black "> <span className="text-xs sm:text-sm font-medium text-black ">

View file

@ -1 +1 @@
docker build -f Dockerfile . -t katanemo/plano -t katanemo/plano:0.4.11 docker build -f Dockerfile . -t katanemo/plano -t katanemo/plano:0.4.12

View file

@ -1,3 +1,3 @@
"""Plano CLI - Intelligent Prompt Gateway.""" """Plano CLI - Intelligent Prompt Gateway."""
__version__ = "0.4.11" __version__ = "0.4.12"

View file

@ -3,18 +3,17 @@ import os
from planoai.utils import convert_legacy_listeners from planoai.utils import convert_legacy_listeners
from jinja2 import Environment, FileSystemLoader from jinja2 import Environment, FileSystemLoader
import yaml import yaml
from jsonschema import validate from jsonschema import validate, ValidationError
from urllib.parse import urlparse from urllib.parse import urlparse
from copy import deepcopy from copy import deepcopy
from planoai.consts import DEFAULT_OTEL_TRACING_GRPC_ENDPOINT from planoai.consts import DEFAULT_OTEL_TRACING_GRPC_ENDPOINT
SUPPORTED_PROVIDERS_WITH_BASE_URL = [ SUPPORTED_PROVIDERS_WITH_BASE_URL = [
"azure_openai", "azure_openai",
"ollama", "ollama",
"qwen", "qwen",
"amazon_bedrock", "amazon_bedrock",
"arch", "plano",
] ]
SUPPORTED_PROVIDERS_WITHOUT_BASE_URL = [ SUPPORTED_PROVIDERS_WITHOUT_BASE_URL = [
@ -368,47 +367,52 @@ def validate_and_render_schema():
llms_with_endpoint.append(model_provider) llms_with_endpoint.append(model_provider)
llms_with_endpoint_cluster_names.add(cluster_name) llms_with_endpoint_cluster_names.add(cluster_name)
if len(model_usage_name_keys) > 0: overrides_config = config_yaml.get("overrides", {})
routing_model_provider = config_yaml.get("routing", {}).get( # Build lookup of model names (already prefix-stripped by config processing)
"model_provider", None model_name_set = {mp.get("model") for mp in updated_model_providers}
# Auto-add arch-router provider if routing preferences exist and no provider matches the router model
router_model = overrides_config.get("llm_routing_model", "Arch-Router")
# Strip provider prefix for comparison since config processing strips prefixes from model names
router_model_id = (
router_model.split("/", 1)[1] if "/" in router_model else router_model
)
if len(model_usage_name_keys) > 0 and router_model_id not in model_name_set:
updated_model_providers.append(
{
"name": "arch-router",
"provider_interface": "plano",
"model": router_model_id,
"internal": True,
}
) )
if (
routing_model_provider
and routing_model_provider not in model_provider_name_set
):
raise Exception(
f"Routing model_provider {routing_model_provider} is not defined in model_providers"
)
if (
routing_model_provider is None
and "arch-router" not in model_provider_name_set
):
updated_model_providers.append(
{
"name": "arch-router",
"provider_interface": "arch",
"model": config_yaml.get("routing", {}).get("model", "Arch-Router"),
"internal": True,
}
)
# Always add arch-function model provider if not already defined # Always add arch-function model provider if not already defined
if "arch-function" not in model_provider_name_set: if "arch-function" not in model_provider_name_set:
updated_model_providers.append( updated_model_providers.append(
{ {
"name": "arch-function", "name": "arch-function",
"provider_interface": "arch", "provider_interface": "plano",
"model": "Arch-Function", "model": "Arch-Function",
"internal": True, "internal": True,
} }
) )
if "plano-orchestrator" not in model_provider_name_set: # Auto-add plano-orchestrator provider if no provider matches the orchestrator model
orchestrator_model = overrides_config.get(
"agent_orchestration_model", "Plano-Orchestrator"
)
orchestrator_model_id = (
orchestrator_model.split("/", 1)[1]
if "/" in orchestrator_model
else orchestrator_model
)
if orchestrator_model_id not in model_name_set:
updated_model_providers.append( updated_model_providers.append(
{ {
"name": "plano-orchestrator", "name": "plano/orchestrator",
"provider_interface": "arch", "provider_interface": "plano",
"model": "Plano-Orchestrator", "model": orchestrator_model_id,
"internal": True, "internal": True,
} }
) )
@ -513,11 +517,15 @@ def validate_prompt_config(plano_config_file, plano_config_schema_file):
try: try:
validate(config_yaml, config_schema_yaml) validate(config_yaml, config_schema_yaml)
except Exception as e: except ValidationError as e:
print( path = (
f"Error validating plano_config file: {plano_config_file}, schema file: {plano_config_schema_file}, error: {e}" "".join(str(p) for p in e.absolute_path) if e.absolute_path else "root"
) )
raise e raise ValidationError(
f"{e.message}\n Location: {path}\n Value: {e.instance}"
) from None
except Exception as e:
raise
if __name__ == "__main__": if __name__ == "__main__":

View file

@ -5,7 +5,7 @@ PLANO_COLOR = "#969FF4"
SERVICE_NAME_ARCHGW = "plano" SERVICE_NAME_ARCHGW = "plano"
PLANO_DOCKER_NAME = "plano" PLANO_DOCKER_NAME = "plano"
PLANO_DOCKER_IMAGE = os.getenv("PLANO_DOCKER_IMAGE", "katanemo/plano:0.4.11") PLANO_DOCKER_IMAGE = os.getenv("PLANO_DOCKER_IMAGE", "katanemo/plano:0.4.12")
DEFAULT_OTEL_TRACING_GRPC_ENDPOINT = "http://localhost:4317" DEFAULT_OTEL_TRACING_GRPC_ENDPOINT = "http://localhost:4317"
# Native mode constants # Native mode constants

View file

@ -420,9 +420,16 @@ def native_validate_config(plano_config_file):
with _temporary_env(overrides): with _temporary_env(overrides):
from planoai.config_generator import validate_and_render_schema from planoai.config_generator import validate_and_render_schema
# Suppress verbose print output from config_generator # Suppress verbose print output from config_generator but capture errors
with contextlib.redirect_stdout(io.StringIO()): captured = io.StringIO()
validate_and_render_schema() try:
with contextlib.redirect_stdout(captured):
validate_and_render_schema()
except SystemExit:
# validate_and_render_schema calls exit(1) on failure after
# printing to stdout; re-raise so the caller gets a useful message.
output = captured.getvalue().strip()
raise Exception(output) if output else Exception("Config validation failed")
def native_logs(debug=False, follow=False): def native_logs(debug=False, follow=False):

View file

@ -1,6 +1,6 @@
[project] [project]
name = "planoai" name = "planoai"
version = "0.4.11" version = "0.4.12"
description = "Python-based CLI tool to manage Plano." description = "Python-based CLI tool to manage Plano."
authors = [{name = "Katanemo Labs, Inc."}] authors = [{name = "Katanemo Labs, Inc."}]
readme = "README.md" readme = "README.md"

2
cli/uv.lock generated
View file

@ -337,7 +337,7 @@ wheels = [
[[package]] [[package]]
name = "planoai" name = "planoai"
version = "0.4.9" version = "0.4.12"
source = { editable = "." } source = { editable = "." }
dependencies = [ dependencies = [
{ name = "click" }, { name = "click" },

View file

@ -594,13 +594,13 @@ static_resources:
clusters: clusters:
- name: arch - name: plano
connect_timeout: {{ upstream_connect_timeout | default('5s') }} connect_timeout: {{ upstream_connect_timeout | default('5s') }}
type: LOGICAL_DNS type: LOGICAL_DNS
dns_lookup_family: V4_ONLY dns_lookup_family: V4_ONLY
lb_policy: ROUND_ROBIN lb_policy: ROUND_ROBIN
load_assignment: load_assignment:
cluster_name: arch cluster_name: plano
endpoints: endpoints:
- lb_endpoints: - lb_endpoints:
- endpoint: - endpoint:

View file

@ -181,7 +181,7 @@ properties:
provider_interface: provider_interface:
type: string type: string
enum: enum:
- arch - plano
- claude - claude
- deepseek - deepseek
- groq - groq
@ -228,7 +228,7 @@ properties:
provider_interface: provider_interface:
type: string type: string
enum: enum:
- arch - plano
- claude - claude
- deepseek - deepseek
- groq - groq
@ -279,6 +279,12 @@ properties:
upstream_tls_ca_path: upstream_tls_ca_path:
type: string type: string
description: "Path to the trusted CA bundle for upstream TLS verification. Default is '/etc/ssl/certs/ca-certificates.crt'." description: "Path to the trusted CA bundle for upstream TLS verification. Default is '/etc/ssl/certs/ca-certificates.crt'."
llm_routing_model:
type: string
description: "Model name for the LLM router (e.g., 'Arch-Router'). Must match a model in model_providers."
agent_orchestration_model:
type: string
description: "Model name for the agent orchestrator (e.g., 'Plano-Orchestrator'). Must match a model in model_providers."
system_prompt: system_prompt:
type: string type: string
prompt_targets: prompt_targets:
@ -416,14 +422,6 @@ properties:
enum: enum:
- llm - llm
- prompt - prompt
routing:
type: object
properties:
llm_provider:
type: string
model:
type: string
additionalProperties: false
state_storage: state_storage:
type: object type: object
properties: properties:

View file

@ -178,6 +178,7 @@ mod tests {
Arc::new(OrchestratorService::new( Arc::new(OrchestratorService::new(
"http://localhost:8080".to_string(), "http://localhost:8080".to_string(),
"test-model".to_string(), "test-model".to_string(),
"plano-orchestrator".to_string(),
)) ))
} }

View file

@ -23,6 +23,7 @@ mod tests {
Arc::new(OrchestratorService::new( Arc::new(OrchestratorService::new(
"http://localhost:8080".to_string(), "http://localhost:8080".to_string(),
"test-model".to_string(), "test-model".to_string(),
"plano-orchestrator".to_string(),
)) ))
} }

View file

@ -11,9 +11,7 @@ use brightstaff::state::StateStorage;
use brightstaff::utils::tracing::init_tracer; use brightstaff::utils::tracing::init_tracer;
use bytes::Bytes; use bytes::Bytes;
use common::configuration::{Agent, Configuration, ListenerType}; use common::configuration::{Agent, Configuration, ListenerType};
use common::consts::{ use common::consts::{CHAT_COMPLETIONS_PATH, MESSAGES_PATH, OPENAI_RESPONSES_API_PATH};
CHAT_COMPLETIONS_PATH, MESSAGES_PATH, OPENAI_RESPONSES_API_PATH, PLANO_ORCHESTRATOR_MODEL_NAME,
};
use common::llm_providers::LlmProviders; use common::llm_providers::LlmProviders;
use http_body_util::{combinators::BoxBody, BodyExt, Empty}; use http_body_util::{combinators::BoxBody, BodyExt, Empty};
use hyper::body::Incoming; use hyper::body::Incoming;
@ -36,6 +34,8 @@ pub mod router;
const BIND_ADDRESS: &str = "0.0.0.0:9091"; const BIND_ADDRESS: &str = "0.0.0.0:9091";
const DEFAULT_ROUTING_LLM_PROVIDER: &str = "arch-router"; const DEFAULT_ROUTING_LLM_PROVIDER: &str = "arch-router";
const DEFAULT_ROUTING_MODEL_NAME: &str = "Arch-Router"; const DEFAULT_ROUTING_MODEL_NAME: &str = "Arch-Router";
const DEFAULT_ORCHESTRATOR_LLM_PROVIDER: &str = "plano-orchestrator";
const DEFAULT_ORCHESTRATOR_MODEL_NAME: &str = "Plano-Orchestrator";
// Utility function to extract the context from the incoming request headers // Utility function to extract the context from the incoming request headers
fn extract_context_from_request(req: &Request<Incoming>) -> Context { fn extract_context_from_request(req: &Request<Incoming>) -> Context {
@ -139,16 +139,21 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
env::var("LLM_PROVIDER_ENDPOINT").unwrap_or_else(|_| "http://localhost:12001".to_string()); env::var("LLM_PROVIDER_ENDPOINT").unwrap_or_else(|_| "http://localhost:12001".to_string());
let listener = TcpListener::bind(bind_address).await?; let listener = TcpListener::bind(bind_address).await?;
let routing_model_name: String = plano_config let overrides = plano_config.overrides.clone().unwrap_or_default();
.routing
.as_ref() // Strip provider prefix (e.g. "arch/") to get the model ID used in upstream requests
.and_then(|r| r.model.clone()) let routing_model_name: String = overrides
.unwrap_or_else(|| DEFAULT_ROUTING_MODEL_NAME.to_string()); .llm_routing_model
.as_deref()
.map(|m| m.split_once('/').map(|(_, id)| id).unwrap_or(m))
.unwrap_or(DEFAULT_ROUTING_MODEL_NAME)
.to_string();
let routing_llm_provider = plano_config let routing_llm_provider = plano_config
.routing .model_providers
.as_ref() .iter()
.and_then(|r| r.model_provider.clone()) .find(|p| p.model.as_deref() == Some(routing_model_name.as_str()))
.map(|p| p.name.clone())
.unwrap_or_else(|| DEFAULT_ROUTING_LLM_PROVIDER.to_string()); .unwrap_or_else(|| DEFAULT_ROUTING_LLM_PROVIDER.to_string());
let router_service: Arc<RouterService> = Arc::new(RouterService::new( let router_service: Arc<RouterService> = Arc::new(RouterService::new(
@ -158,9 +163,25 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
routing_llm_provider, routing_llm_provider,
)); ));
// Strip provider prefix (e.g. "arch/") to get the model ID used in upstream requests
let orchestrator_model_name: String = overrides
.agent_orchestration_model
.as_deref()
.map(|m| m.split_once('/').map(|(_, id)| id).unwrap_or(m))
.unwrap_or(DEFAULT_ORCHESTRATOR_MODEL_NAME)
.to_string();
let orchestrator_llm_provider: String = plano_config
.model_providers
.iter()
.find(|p| p.model.as_deref() == Some(orchestrator_model_name.as_str()))
.map(|p| p.name.clone())
.unwrap_or_else(|| DEFAULT_ORCHESTRATOR_LLM_PROVIDER.to_string());
let orchestrator_service: Arc<OrchestratorService> = Arc::new(OrchestratorService::new( let orchestrator_service: Arc<OrchestratorService> = Arc::new(OrchestratorService::new(
format!("{llm_provider_url}{CHAT_COMPLETIONS_PATH}"), format!("{llm_provider_url}{CHAT_COMPLETIONS_PATH}"),
PLANO_ORCHESTRATOR_MODEL_NAME.to_string(), orchestrator_model_name,
orchestrator_llm_provider,
)); ));
let model_aliases = Arc::new(plano_config.model_aliases.clone()); let model_aliases = Arc::new(plano_config.model_aliases.clone());

View file

@ -2,7 +2,7 @@ use std::{collections::HashMap, sync::Arc};
use common::{ use common::{
configuration::{AgentUsagePreference, OrchestrationPreference}, configuration::{AgentUsagePreference, OrchestrationPreference},
consts::{ARCH_PROVIDER_HINT_HEADER, PLANO_ORCHESTRATOR_MODEL_NAME, REQUEST_ID_HEADER}, consts::{ARCH_PROVIDER_HINT_HEADER, REQUEST_ID_HEADER},
}; };
use hermesllm::apis::openai::{ChatCompletionsResponse, Message}; use hermesllm::apis::openai::{ChatCompletionsResponse, Message};
use hyper::header; use hyper::header;
@ -19,6 +19,7 @@ pub struct OrchestratorService {
orchestrator_url: String, orchestrator_url: String,
client: reqwest::Client, client: reqwest::Client,
orchestrator_model: Arc<dyn OrchestratorModel>, orchestrator_model: Arc<dyn OrchestratorModel>,
orchestrator_provider_name: String,
} }
#[derive(Debug, Error)] #[derive(Debug, Error)]
@ -36,7 +37,11 @@ pub enum OrchestrationError {
pub type Result<T> = std::result::Result<T, OrchestrationError>; pub type Result<T> = std::result::Result<T, OrchestrationError>;
impl OrchestratorService { impl OrchestratorService {
pub fn new(orchestrator_url: String, orchestration_model_name: String) -> Self { pub fn new(
orchestrator_url: String,
orchestration_model_name: String,
orchestrator_provider_name: String,
) -> Self {
// Empty agent orchestrations - will be provided via usage_preferences in requests // Empty agent orchestrations - will be provided via usage_preferences in requests
let agent_orchestrations: HashMap<String, Vec<OrchestrationPreference>> = HashMap::new(); let agent_orchestrations: HashMap<String, Vec<OrchestrationPreference>> = HashMap::new();
@ -50,6 +55,7 @@ impl OrchestratorService {
orchestrator_url, orchestrator_url,
client: reqwest::Client::new(), client: reqwest::Client::new(),
orchestrator_model, orchestrator_model,
orchestrator_provider_name,
} }
} }
@ -75,12 +81,12 @@ impl OrchestratorService {
debug!( debug!(
model = %self.orchestrator_model.get_model_name(), model = %self.orchestrator_model.get_model_name(),
endpoint = %self.orchestrator_url, endpoint = %self.orchestrator_url,
"sending request to arch-orchestrator" "sending request to plano-orchestrator"
); );
debug!( debug!(
body = %serde_json::to_string(&orchestrator_request).unwrap(), body = %serde_json::to_string(&orchestrator_request).unwrap(),
"arch orchestrator request" "plano orchestrator request"
); );
let mut orchestration_request_headers = header::HeaderMap::new(); let mut orchestration_request_headers = header::HeaderMap::new();
@ -91,7 +97,7 @@ impl OrchestratorService {
orchestration_request_headers.insert( orchestration_request_headers.insert(
header::HeaderName::from_static(ARCH_PROVIDER_HINT_HEADER), header::HeaderName::from_static(ARCH_PROVIDER_HINT_HEADER),
header::HeaderValue::from_str(PLANO_ORCHESTRATOR_MODEL_NAME).unwrap(), header::HeaderValue::from_str(&self.orchestrator_provider_name).unwrap(),
); );
// Inject OpenTelemetry trace context from current span // Inject OpenTelemetry trace context from current span
@ -110,7 +116,7 @@ impl OrchestratorService {
orchestration_request_headers.insert( orchestration_request_headers.insert(
header::HeaderName::from_static("model"), header::HeaderName::from_static("model"),
header::HeaderValue::from_static(PLANO_ORCHESTRATOR_MODEL_NAME), header::HeaderValue::from_str(&self.orchestrator_provider_name).unwrap(),
); );
let start_time = std::time::Instant::now(); let start_time = std::time::Instant::now();

View file

@ -7,12 +7,6 @@ use crate::api::open_ai::{
ChatCompletionTool, FunctionDefinition, FunctionParameter, FunctionParameters, ParameterType, ChatCompletionTool, FunctionDefinition, FunctionParameter, FunctionParameters, ParameterType,
}; };
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Routing {
pub model_provider: Option<String>,
pub model: Option<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelAlias { pub struct ModelAlias {
pub target: String, pub target: String,
@ -84,7 +78,6 @@ pub struct Configuration {
pub ratelimits: Option<Vec<Ratelimit>>, pub ratelimits: Option<Vec<Ratelimit>>,
pub tracing: Option<Tracing>, pub tracing: Option<Tracing>,
pub mode: Option<GatewayMode>, pub mode: Option<GatewayMode>,
pub routing: Option<Routing>,
pub agents: Option<Vec<Agent>>, pub agents: Option<Vec<Agent>>,
pub filters: Option<Vec<Agent>>, pub filters: Option<Vec<Agent>>,
pub listeners: Vec<Listener>, pub listeners: Vec<Listener>,
@ -96,6 +89,8 @@ pub struct Overrides {
pub prompt_target_intent_matching_threshold: Option<f64>, pub prompt_target_intent_matching_threshold: Option<f64>,
pub optimize_context_window: Option<bool>, pub optimize_context_window: Option<bool>,
pub use_agent_orchestrator: Option<bool>, pub use_agent_orchestrator: Option<bool>,
pub llm_routing_model: Option<String>,
pub agent_orchestration_model: Option<String>,
} }
#[derive(Debug, Clone, Serialize, Deserialize, Default)] #[derive(Debug, Clone, Serialize, Deserialize, Default)]
@ -219,8 +214,6 @@ pub struct EmbeddingProviver {
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)] #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub enum LlmProviderType { pub enum LlmProviderType {
#[serde(rename = "arch")]
Arch,
#[serde(rename = "anthropic")] #[serde(rename = "anthropic")]
Anthropic, Anthropic,
#[serde(rename = "deepseek")] #[serde(rename = "deepseek")]
@ -249,12 +242,13 @@ pub enum LlmProviderType {
Qwen, Qwen,
#[serde(rename = "amazon_bedrock")] #[serde(rename = "amazon_bedrock")]
AmazonBedrock, AmazonBedrock,
#[serde(rename = "plano")]
Plano,
} }
impl Display for LlmProviderType { impl Display for LlmProviderType {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self { match self {
LlmProviderType::Arch => write!(f, "arch"),
LlmProviderType::Anthropic => write!(f, "anthropic"), LlmProviderType::Anthropic => write!(f, "anthropic"),
LlmProviderType::Deepseek => write!(f, "deepseek"), LlmProviderType::Deepseek => write!(f, "deepseek"),
LlmProviderType::Groq => write!(f, "groq"), LlmProviderType::Groq => write!(f, "groq"),
@ -269,6 +263,7 @@ impl Display for LlmProviderType {
LlmProviderType::Zhipu => write!(f, "zhipu"), LlmProviderType::Zhipu => write!(f, "zhipu"),
LlmProviderType::Qwen => write!(f, "qwen"), LlmProviderType::Qwen => write!(f, "qwen"),
LlmProviderType::AmazonBedrock => write!(f, "amazon_bedrock"), LlmProviderType::AmazonBedrock => write!(f, "amazon_bedrock"),
LlmProviderType::Plano => write!(f, "plano"),
} }
} }
} }
@ -603,14 +598,14 @@ mod test {
}, },
LlmProvider { LlmProvider {
name: "arch-router".to_string(), name: "arch-router".to_string(),
provider_interface: LlmProviderType::Arch, provider_interface: LlmProviderType::Plano,
model: Some("Arch-Router".to_string()), model: Some("Arch-Router".to_string()),
internal: Some(true), internal: Some(true),
..Default::default() ..Default::default()
}, },
LlmProvider { LlmProvider {
name: "plano-orchestrator".to_string(), name: "plano-orchestrator".to_string(),
provider_interface: LlmProviderType::Arch, provider_interface: LlmProviderType::Plano,
model: Some("Plano-Orchestrator".to_string()), model: Some("Plano-Orchestrator".to_string()),
internal: Some(true), internal: Some(true),
..Default::default() ..Default::default()

View file

@ -33,5 +33,4 @@ pub const OTEL_COLLECTOR_HTTP: &str = "opentelemetry_collector_http";
pub const LLM_ROUTE_HEADER: &str = "x-arch-llm-route"; pub const LLM_ROUTE_HEADER: &str = "x-arch-llm-route";
pub const ENVOY_RETRY_HEADER: &str = "x-envoy-max-retries"; pub const ENVOY_RETRY_HEADER: &str = "x-envoy-max-retries";
pub const BRIGHT_STAFF_SERVICE_NAME: &str = "brightstaff"; pub const BRIGHT_STAFF_SERVICE_NAME: &str = "brightstaff";
pub const PLANO_ORCHESTRATOR_MODEL_NAME: &str = "Plano-Orchestrator"; pub const PLANO_FC_CLUSTER: &str = "plano";
pub const ARCH_FC_CLUSTER: &str = "arch";

View file

@ -1,183 +1,16 @@
version: '1.0' version: '1.0'
source: canonical-apis source: canonical-apis
providers: providers:
mistralai:
- mistralai/mistral-medium-2505
- mistralai/mistral-medium-2508
- mistralai/mistral-medium-latest
- mistralai/mistral-medium
- mistralai/mistral-vibe-cli-with-tools
- mistralai/open-mistral-nemo
- mistralai/open-mistral-nemo-2407
- mistralai/mistral-tiny-2407
- mistralai/mistral-tiny-latest
- mistralai/mistral-large-2411
- mistralai/pixtral-large-2411
- mistralai/pixtral-large-latest
- mistralai/mistral-large-pixtral-2411
- mistralai/codestral-2508
- mistralai/codestral-latest
- mistralai/devstral-small-2507
- mistralai/devstral-medium-2507
- mistralai/devstral-2512
- mistralai/mistral-vibe-cli-latest
- mistralai/devstral-medium-latest
- mistralai/devstral-latest
- mistralai/labs-devstral-small-2512
- mistralai/devstral-small-latest
- mistralai/mistral-small-2506
- mistralai/mistral-small-latest
- mistralai/labs-mistral-small-creative
- mistralai/magistral-medium-2509
- mistralai/magistral-medium-latest
- mistralai/magistral-small-2509
- mistralai/magistral-small-latest
- mistralai/mistral-large-2512
- mistralai/mistral-large-latest
- mistralai/ministral-3b-2512
- mistralai/ministral-3b-latest
- mistralai/ministral-8b-2512
- mistralai/ministral-8b-latest
- mistralai/ministral-14b-2512
- mistralai/ministral-14b-latest
- mistralai/mistral-small-2501
- mistralai/mistral-embed-2312
- mistralai/mistral-embed
- mistralai/codestral-embed
- mistralai/codestral-embed-2505
openai:
- openai/gpt-4-0613
- openai/gpt-4
- openai/gpt-3.5-turbo
- openai/gpt-5.2-codex
- openai/gpt-3.5-turbo-instruct
- openai/gpt-3.5-turbo-instruct-0914
- openai/gpt-4-1106-preview
- openai/gpt-3.5-turbo-1106
- openai/gpt-4-0125-preview
- openai/gpt-4-turbo-preview
- openai/gpt-3.5-turbo-0125
- openai/gpt-4-turbo
- openai/gpt-4-turbo-2024-04-09
- openai/gpt-4o
- openai/gpt-4o-2024-05-13
- openai/gpt-4o-mini-2024-07-18
- openai/gpt-4o-mini
- openai/gpt-4o-2024-08-06
- openai/chatgpt-4o-latest
- openai/o1-2024-12-17
- openai/o1
- openai/computer-use-preview
- openai/o3-mini
- openai/o3-mini-2025-01-31
- openai/gpt-4o-2024-11-20
- openai/computer-use-preview-2025-03-11
- openai/gpt-4o-search-preview-2025-03-11
- openai/gpt-4o-search-preview
- openai/gpt-4o-mini-search-preview-2025-03-11
- openai/gpt-4o-mini-search-preview
- openai/o1-pro-2025-03-19
- openai/o1-pro
- openai/o3-2025-04-16
- openai/o4-mini-2025-04-16
- openai/o3
- openai/o4-mini
- openai/gpt-4.1-2025-04-14
- openai/gpt-4.1
- openai/gpt-4.1-mini-2025-04-14
- openai/gpt-4.1-mini
- openai/gpt-4.1-nano-2025-04-14
- openai/gpt-4.1-nano
- openai/o3-pro
- openai/o3-pro-2025-06-10
- openai/o4-mini-deep-research
- openai/o3-deep-research
- openai/o3-deep-research-2025-06-26
- openai/o4-mini-deep-research-2025-06-26
- openai/gpt-5-chat-latest
- openai/gpt-5-2025-08-07
- openai/gpt-5
- openai/gpt-5-mini-2025-08-07
- openai/gpt-5-mini
- openai/gpt-5-nano-2025-08-07
- openai/gpt-5-nano
- openai/gpt-5-codex
- openai/gpt-5-pro-2025-10-06
- openai/gpt-5-pro
- openai/gpt-5-search-api
- openai/gpt-5-search-api-2025-10-14
- openai/gpt-5.1-chat-latest
- openai/gpt-5.1-2025-11-13
- openai/gpt-5.1
- openai/gpt-5.1-codex
- openai/gpt-5.1-codex-mini
- openai/gpt-5.1-codex-max
- openai/gpt-5.2-2025-12-11
- openai/gpt-5.2
- openai/gpt-5.2-pro-2025-12-11
- openai/gpt-5.2-pro
- openai/gpt-5.2-chat-latest
- openai/gpt-3.5-turbo-16k
- openai/ft:gpt-3.5-turbo-0613:katanemo::8CMZbm0P
deepseek: deepseek:
- deepseek/deepseek-chat - deepseek/deepseek-chat
- deepseek/deepseek-reasoner - deepseek/deepseek-reasoner
x-ai:
- x-ai/grok-2-vision-1212
- x-ai/grok-3
- x-ai/grok-3-mini
- x-ai/grok-4-0709
- x-ai/grok-4-1-fast-non-reasoning
- x-ai/grok-4-1-fast-reasoning
- x-ai/grok-4-fast-non-reasoning
- x-ai/grok-4-fast-reasoning
- x-ai/grok-code-fast-1
- x-ai/grok-imagine-image
- x-ai/grok-imagine-video
moonshotai:
- moonshotai/kimi-k2-thinking
- moonshotai/kimi-k2.5
- moonshotai/moonshot-v1-128k-vision-preview
- moonshotai/moonshot-v1-8k
- moonshotai/kimi-k2-turbo-preview
- moonshotai/moonshot-v1-128k
- moonshotai/moonshot-v1-32k-vision-preview
- moonshotai/kimi-k2-thinking-turbo
- moonshotai/kimi-latest
- moonshotai/moonshot-v1-32k
- moonshotai/moonshot-v1-auto
- moonshotai/kimi-k2-0711-preview
- moonshotai/kimi-k2-0905-preview
- moonshotai/moonshot-v1-8k-vision-preview
anthropic:
- anthropic/claude-opus-4-6
- anthropic/claude-opus-4-5-20251101
- anthropic/claude-opus-4-5
- anthropic/claude-haiku-4-5-20251001
- anthropic/claude-haiku-4-5
- anthropic/claude-sonnet-4-5-20250929
- anthropic/claude-sonnet-4-5
- anthropic/claude-opus-4-1-20250805
- anthropic/claude-opus-4-1
- anthropic/claude-opus-4-20250514
- anthropic/claude-opus-4
- anthropic/claude-sonnet-4-20250514
- anthropic/claude-sonnet-4
- anthropic/claude-3-7-sonnet-20250219
- anthropic/claude-3-7-sonnet
- anthropic/claude-3-5-haiku-20241022
- anthropic/claude-3-5-haiku
- anthropic/claude-3-haiku-20240307
- anthropic/claude-3-haiku
google: google:
- google/gemini-2.5-flash - google/gemini-2.5-flash
- google/gemini-2.5-pro - google/gemini-2.5-pro
- google/gemini-2.0-flash - google/gemini-2.0-flash
- google/gemini-2.0-flash-001 - google/gemini-2.0-flash-001
- google/gemini-2.0-flash-exp-image-generation
- google/gemini-2.0-flash-lite-001 - google/gemini-2.0-flash-lite-001
- google/gemini-2.0-flash-lite - google/gemini-2.0-flash-lite
- google/gemini-exp-1206
- google/gemini-2.5-flash-preview-tts - google/gemini-2.5-flash-preview-tts
- google/gemini-2.5-pro-preview-tts - google/gemini-2.5-pro-preview-tts
- google/gemma-3-1b-it - google/gemma-3-1b-it
@ -191,12 +24,15 @@ providers:
- google/gemini-pro-latest - google/gemini-pro-latest
- google/gemini-2.5-flash-lite - google/gemini-2.5-flash-lite
- google/gemini-2.5-flash-image - google/gemini-2.5-flash-image
- google/gemini-2.5-flash-preview-09-2025
- google/gemini-2.5-flash-lite-preview-09-2025 - google/gemini-2.5-flash-lite-preview-09-2025
- google/gemini-3-pro-preview - google/gemini-3-pro-preview
- google/gemini-3-flash-preview - google/gemini-3-flash-preview
- google/gemini-3.1-pro-preview
- google/gemini-3.1-pro-preview-customtools
- google/gemini-3.1-flash-lite-preview
- google/gemini-3-pro-image-preview - google/gemini-3-pro-image-preview
- google/nano-banana-pro-preview - google/nano-banana-pro-preview
- google/gemini-3.1-flash-image-preview
- google/gemini-robotics-er-1.5-preview - google/gemini-robotics-er-1.5-preview
- google/gemini-2.5-computer-use-preview-10-2025 - google/gemini-2.5-computer-use-preview-10-2025
- google/deep-research-pro-preview-12-2025 - google/deep-research-pro-preview-12-2025
@ -212,7 +48,37 @@ providers:
- amazon/amazon.nova-premier-v1:0 - amazon/amazon.nova-premier-v1:0
- amazon/amazon.nova-lite-v1:0 - amazon/amazon.nova-lite-v1:0
- amazon/amazon.nova-micro-v1:0 - amazon/amazon.nova-micro-v1:0
x-ai:
- x-ai/grok-3
- x-ai/grok-3-mini
- x-ai/grok-4-0709
- x-ai/grok-4-1-fast-non-reasoning
- x-ai/grok-4-1-fast-reasoning
- x-ai/grok-4-fast-non-reasoning
- x-ai/grok-4-fast-reasoning
- x-ai/grok-4.20-beta-0309-non-reasoning
- x-ai/grok-4.20-beta-0309-reasoning
- x-ai/grok-4.20-multi-agent-beta-0309
- x-ai/grok-code-fast-1
- x-ai/grok-imagine-image
- x-ai/grok-imagine-video
z-ai:
- z-ai/glm-4.5
- z-ai/glm-4.5-air
- z-ai/glm-4.6
- z-ai/glm-4.7
- z-ai/glm-5
qwen: qwen:
- qwen/qwen3-asr-flash-2026-02-10
- qwen/qwen3.5-flash-2026-02-23
- qwen/qwen3.5-flash
- qwen/qwen3.5-122b-a10b
- qwen/qwen3.5-35b-a3b
- qwen/qwen3.5-27b
- qwen/qwen3-coder-next
- qwen/qwen3.5-397b-a17b
- qwen/qwen3.5-plus-2026-02-15
- qwen/qwen3.5-plus
- qwen/qwen3-vl-flash-2026-01-22 - qwen/qwen3-vl-flash-2026-01-22
- qwen/qwen3-max-2026-01-23 - qwen/qwen3-max-2026-01-23
- qwen/qwen-plus-character - qwen/qwen-plus-character
@ -294,13 +160,161 @@ providers:
- qwen/qwen-max - qwen/qwen-max
- qwen/qwen-plus - qwen/qwen-plus
- qwen/qwen-turbo - qwen/qwen-turbo
z-ai: mistralai:
- z-ai/glm-4.5 - mistralai/mistral-medium-2505
- z-ai/glm-4.5-air - mistralai/mistral-medium-2508
- z-ai/glm-4.6 - mistralai/mistral-medium-latest
- z-ai/glm-4.7 - mistralai/mistral-medium
- z-ai/glm-5 - mistralai/mistral-vibe-cli-with-tools
- mistralai/open-mistral-nemo
- mistralai/open-mistral-nemo-2407
- mistralai/mistral-tiny-2407
- mistralai/mistral-tiny-latest
- mistralai/codestral-2508
- mistralai/codestral-latest
- mistralai/devstral-2512
- mistralai/mistral-vibe-cli-latest
- mistralai/devstral-medium-latest
- mistralai/devstral-latest
- mistralai/mistral-small-2506
- mistralai/mistral-small-latest
- mistralai/labs-mistral-small-creative
- mistralai/magistral-medium-2509
- mistralai/magistral-medium-latest
- mistralai/magistral-small-2509
- mistralai/magistral-small-latest
- mistralai/mistral-large-2512
- mistralai/mistral-large-latest
- mistralai/ministral-3b-2512
- mistralai/ministral-3b-latest
- mistralai/ministral-8b-2512
- mistralai/ministral-8b-latest
- mistralai/ministral-14b-2512
- mistralai/ministral-14b-latest
- mistralai/mistral-large-2411
- mistralai/pixtral-large-2411
- mistralai/pixtral-large-latest
- mistralai/mistral-large-pixtral-2411
- mistralai/devstral-small-2507
- mistralai/devstral-medium-2507
- mistralai/labs-devstral-small-2512
- mistralai/devstral-small-latest
- mistralai/mistral-squarepoint-2602
- mistralai/mistral-embed-2312
- mistralai/mistral-embed
- mistralai/codestral-embed
- mistralai/codestral-embed-2505
moonshotai:
- moonshotai/kimi-k2.5
- moonshotai/kimi-k2-0905-preview
- moonshotai/moonshot-v1-32k
- moonshotai/moonshot-v1-128k
- moonshotai/kimi-k2-thinking-turbo
- moonshotai/moonshot-v1-8k-vision-preview
- moonshotai/kimi-k2-0711-preview
- moonshotai/moonshot-v1-auto
- moonshotai/kimi-k2-thinking
- moonshotai/moonshot-v1-128k-vision-preview
- moonshotai/kimi-k2-turbo-preview
- moonshotai/moonshot-v1-32k-vision-preview
- moonshotai/moonshot-v1-8k
anthropic:
- anthropic/claude-sonnet-4-6
- anthropic/claude-opus-4-6
- anthropic/claude-opus-4-5-20251101
- anthropic/claude-opus-4-5
- anthropic/claude-haiku-4-5-20251001
- anthropic/claude-haiku-4-5
- anthropic/claude-sonnet-4-5-20250929
- anthropic/claude-sonnet-4-5
- anthropic/claude-opus-4-1-20250805
- anthropic/claude-opus-4-1
- anthropic/claude-opus-4-20250514
- anthropic/claude-opus-4
- anthropic/claude-sonnet-4-20250514
- anthropic/claude-sonnet-4
- anthropic/claude-3-haiku-20240307
- anthropic/claude-3-haiku
openai:
- openai/gpt-4-0613
- openai/gpt-4
- openai/gpt-3.5-turbo
- openai/gpt-5.4
- openai/gpt-5.3-chat-latest
- openai/gpt-5.4-2026-03-05
- openai/gpt-5.4-pro
- openai/gpt-5.4-pro-2026-03-05
- openai/gpt-3.5-turbo-instruct
- openai/gpt-3.5-turbo-instruct-0914
- openai/gpt-4-1106-preview
- openai/gpt-3.5-turbo-1106
- openai/gpt-4-0125-preview
- openai/gpt-4-turbo-preview
- openai/gpt-3.5-turbo-0125
- openai/gpt-4-turbo
- openai/gpt-4-turbo-2024-04-09
- openai/gpt-4o
- openai/gpt-4o-2024-05-13
- openai/gpt-4o-mini-2024-07-18
- openai/gpt-4o-mini
- openai/gpt-4o-2024-08-06
- openai/o1-2024-12-17
- openai/o1
- openai/computer-use-preview
- openai/o3-mini
- openai/o3-mini-2025-01-31
- openai/gpt-4o-2024-11-20
- openai/computer-use-preview-2025-03-11
- openai/gpt-4o-mini-search-preview-2025-03-11
- openai/gpt-4o-mini-search-preview
- openai/o1-pro-2025-03-19
- openai/o1-pro
- openai/o3-2025-04-16
- openai/o4-mini-2025-04-16
- openai/o3
- openai/o4-mini
- openai/gpt-4.1-2025-04-14
- openai/gpt-4.1
- openai/gpt-4.1-mini-2025-04-14
- openai/gpt-4.1-mini
- openai/gpt-4.1-nano-2025-04-14
- openai/gpt-4.1-nano
- openai/o3-pro
- openai/o3-pro-2025-06-10
- openai/o4-mini-deep-research
- openai/o3-deep-research
- openai/o3-deep-research-2025-06-26
- openai/o4-mini-deep-research-2025-06-26
- openai/gpt-5-chat-latest
- openai/gpt-5-2025-08-07
- openai/gpt-5
- openai/gpt-5-mini-2025-08-07
- openai/gpt-5-mini
- openai/gpt-5-nano-2025-08-07
- openai/gpt-5-nano
- openai/gpt-5-codex
- openai/gpt-5-pro-2025-10-06
- openai/gpt-5-pro
- openai/gpt-5-search-api
- openai/gpt-5-search-api-2025-10-14
- openai/gpt-5.1-chat-latest
- openai/gpt-5.1-2025-11-13
- openai/gpt-5.1
- openai/gpt-5.1-codex
- openai/gpt-5.1-codex-mini
- openai/gpt-5.1-codex-max
- openai/gpt-5.2-2025-12-11
- openai/gpt-5.2
- openai/gpt-5.2-pro-2025-12-11
- openai/gpt-5.2-pro
- openai/gpt-5.2-chat-latest
- openai/gpt-5.2-codex
- openai/gpt-5.3-codex
- openai/gpt-4o-search-preview
- openai/gpt-4o-search-preview-2025-03-11
- openai/gpt-3.5-turbo-16k
- openai/ft:gpt-3.5-turbo-0613:katanemo::8CMZbm0P
metadata: metadata:
total_providers: 10 total_providers: 10
total_models: 289 total_models: 303
last_updated: 2026-02-13T22:44:30.413065+00:00 last_updated: 2026-03-15T16:47:22.207197+00:00

View file

@ -35,7 +35,7 @@ mod tests {
ProviderId::Mistral ProviderId::Mistral
); );
assert_eq!(ProviderId::try_from("groq").unwrap(), ProviderId::Groq); assert_eq!(ProviderId::try_from("groq").unwrap(), ProviderId::Groq);
assert_eq!(ProviderId::try_from("arch").unwrap(), ProviderId::Arch); assert_eq!(ProviderId::try_from("plano").unwrap(), ProviderId::Plano);
// Test aliases // Test aliases
assert_eq!(ProviderId::try_from("google").unwrap(), ProviderId::Gemini); assert_eq!(ProviderId::try_from("google").unwrap(), ProviderId::Gemini);

View file

@ -34,7 +34,7 @@ pub enum ProviderId {
Gemini, Gemini,
Anthropic, Anthropic,
GitHub, GitHub,
Arch, Plano,
AzureOpenAI, AzureOpenAI,
XAI, XAI,
TogetherAI, TogetherAI,
@ -58,7 +58,7 @@ impl TryFrom<&str> for ProviderId {
"google" => Ok(ProviderId::Gemini), // alias "google" => Ok(ProviderId::Gemini), // alias
"anthropic" => Ok(ProviderId::Anthropic), "anthropic" => Ok(ProviderId::Anthropic),
"github" => Ok(ProviderId::GitHub), "github" => Ok(ProviderId::GitHub),
"arch" => Ok(ProviderId::Arch), "plano" => Ok(ProviderId::Plano),
"azure_openai" => Ok(ProviderId::AzureOpenAI), "azure_openai" => Ok(ProviderId::AzureOpenAI),
"xai" => Ok(ProviderId::XAI), "xai" => Ok(ProviderId::XAI),
"together_ai" => Ok(ProviderId::TogetherAI), "together_ai" => Ok(ProviderId::TogetherAI),
@ -135,7 +135,7 @@ impl ProviderId {
| ProviderId::Groq | ProviderId::Groq
| ProviderId::Mistral | ProviderId::Mistral
| ProviderId::Deepseek | ProviderId::Deepseek
| ProviderId::Arch | ProviderId::Plano
| ProviderId::Gemini | ProviderId::Gemini
| ProviderId::GitHub | ProviderId::GitHub
| ProviderId::AzureOpenAI | ProviderId::AzureOpenAI
@ -153,7 +153,7 @@ impl ProviderId {
| ProviderId::Groq | ProviderId::Groq
| ProviderId::Mistral | ProviderId::Mistral
| ProviderId::Deepseek | ProviderId::Deepseek
| ProviderId::Arch | ProviderId::Plano
| ProviderId::Gemini | ProviderId::Gemini
| ProviderId::GitHub | ProviderId::GitHub
| ProviderId::AzureOpenAI | ProviderId::AzureOpenAI
@ -219,7 +219,7 @@ impl Display for ProviderId {
ProviderId::Gemini => write!(f, "Gemini"), ProviderId::Gemini => write!(f, "Gemini"),
ProviderId::Anthropic => write!(f, "Anthropic"), ProviderId::Anthropic => write!(f, "Anthropic"),
ProviderId::GitHub => write!(f, "GitHub"), ProviderId::GitHub => write!(f, "GitHub"),
ProviderId::Arch => write!(f, "Arch"), ProviderId::Plano => write!(f, "Plano"),
ProviderId::AzureOpenAI => write!(f, "azure_openai"), ProviderId::AzureOpenAI => write!(f, "azure_openai"),
ProviderId::XAI => write!(f, "xai"), ProviderId::XAI => write!(f, "xai"),
ProviderId::TogetherAI => write!(f, "together_ai"), ProviderId::TogetherAI => write!(f, "together_ai"),

View file

@ -873,7 +873,7 @@ impl HttpContext for StreamContext {
// ensure that the provider has an endpoint if the access key is missing else return a bad request // ensure that the provider has an endpoint if the access key is missing else return a bad request
if self.llm_provider.as_ref().unwrap().endpoint.is_none() if self.llm_provider.as_ref().unwrap().endpoint.is_none()
&& self.llm_provider.as_ref().unwrap().provider_interface && self.llm_provider.as_ref().unwrap().provider_interface
!= LlmProviderType::Arch != LlmProviderType::Plano
{ {
self.send_server_error(error, Some(StatusCode::BAD_REQUEST)); self.send_server_error(error, Some(StatusCode::BAD_REQUEST));
} }

View file

@ -123,6 +123,42 @@ Each agent:
Both agents run as native local processes and communicate with Plano running natively on the host. Both agents run as native local processes and communicate with Plano running natively on the host.
## Running with local Plano-Orchestrator (via vLLM)
By default, Plano uses a hosted Plano-Orchestrator endpoint. To self-host the orchestrator model locally using vLLM on a server with an NVIDIA GPU:
1. Install vLLM (the model weights are downloaded automatically the first time `vllm serve` runs in the next step):
```bash
pip install vllm
```
2. Start the vLLM server with the 4B model:
```bash
vllm serve katanemo/Plano-Orchestrator-4B \
--host 0.0.0.0 \
--port 8000 \
--tensor-parallel-size 1 \
--gpu-memory-utilization 0.3 \
--tokenizer katanemo/Plano-Orchestrator-4B \
--chat-template chat_template.jinja \
--served-model-name katanemo/Plano-Orchestrator-4B \
--enable-prefix-caching
```
3. Start the demo with the local orchestrator config:
```bash
./run_demo.sh --local-orchestrator
```
4. Test with curl:
```bash
curl -X POST http://localhost:8001/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{"model": "gpt-5.2", "messages": [{"role": "user", "content": "What is the weather in Istanbul?"}]}'
```
You should see Plano use your local orchestrator to route the request to the weather agent.
## Observability ## Observability
This demo includes full OpenTelemetry (OTel) compatible distributed tracing to monitor and debug agent interactions: This demo includes full OpenTelemetry (OTel) compatible distributed tracing to monitor and debug agent interactions:

View file

@ -0,0 +1,66 @@
version: v0.3.0
overrides:
agent_orchestration_model: plano/katanemo/Plano-Orchestrator-4B
agents:
- id: weather_agent
url: http://localhost:10510
- id: flight_agent
url: http://localhost:10520
model_providers:
- model: plano/katanemo/Plano-Orchestrator-4B
base_url: http://localhost:8000
- model: openai/gpt-5.2
access_key: $OPENAI_API_KEY
default: true
- model: openai/gpt-4o-mini
access_key: $OPENAI_API_KEY # smaller, faster, cheaper model for extracting entities like location
listeners:
- type: agent
name: travel_booking_service
port: 8001
router: plano_orchestrator_v1
agents:
- id: weather_agent
description: |
WeatherAgent is a specialized AI assistant for real-time weather information and forecasts. It provides accurate weather data for any city worldwide using the Open-Meteo API, helping travelers plan their trips with up-to-date weather conditions.
Capabilities:
* Get real-time weather conditions and multi-day forecasts for any city worldwide using Open-Meteo API (free, no API key needed)
* Provides current temperature
* Provides multi-day forecasts
* Provides weather conditions
* Provides sunrise/sunset times
* Provides detailed weather information
* Understands conversation context to resolve location references from previous messages
* Handles weather-related questions including "What's the weather in [city]?", "What's the forecast for [city]?", "How's the weather in [city]?"
* When queries include both weather and other travel questions (e.g., flights, currency), this agent answers ONLY the weather part
- id: flight_agent
description: |
FlightAgent is an AI-powered tool specialized in providing live flight information between airports. It leverages the FlightAware AeroAPI to deliver real-time flight status, gate information, and delay updates.
Capabilities:
* Get live flight information between airports using FlightAware AeroAPI
* Shows real-time flight status
* Shows scheduled/estimated/actual departure and arrival times
* Shows gate and terminal information
* Shows delays
* Shows aircraft type
* Shows flight status
* Automatically resolves city names to airport codes (IATA/ICAO)
* Understands conversation context to infer origin/destination from follow-up questions
* Handles flight-related questions including "What flights go from [city] to [city]?", "Do flights go to [city]?", "Are there direct flights from [city]?"
* When queries include both flight and other travel questions (e.g., weather, currency), this agent answers ONLY the flight part
tracing:
random_sampling: 100
span_attributes:
header_prefixes:
- x-acme-

View file

@ -31,8 +31,13 @@ start_demo() {
fi fi
# Step 4: Start Plano # Step 4: Start Plano
echo "Starting Plano with config.yaml..." PLANO_CONFIG="config.yaml"
planoai up config.yaml if [ "$1" == "--local-orchestrator" ]; then
PLANO_CONFIG="config_local_orchestrator.yaml"
echo "Using local orchestrator config..."
fi
echo "Starting Plano with $PLANO_CONFIG..."
planoai up "$PLANO_CONFIG"
# Step 5: Start agents natively # Step 5: Start agents natively
echo "Starting agents..." echo "Starting agents..."

View file

@ -1,6 +1,54 @@
# Model Routing Service Demo # Model Routing Service Demo
This demo shows how to use the `/routing/v1/*` endpoints to get routing decisions without proxying requests to an LLM. The endpoint accepts standard LLM request formats and returns which model Plano's router would select. Plano is an AI-native proxy and data plane for agentic apps — with built-in orchestration, safety, observability, and intelligent LLM routing.
```
┌───────────┐ ┌─────────────────────────────────┐ ┌──────────────┐
│ Client │ ───► │ Plano │ ───► │ OpenAI │
│ (any │ │ │ │ Anthropic │
│ language)│ │ Arch-Router (1.5B model) │ │ Any Provider│
└───────────┘ │ analyzes intent → picks model │ └──────────────┘
└─────────────────────────────────┘
```
- **One endpoint, many models** — apps call Plano using standard OpenAI/Anthropic APIs; Plano handles provider selection, keys, and failover
- **Intelligent routing** — a lightweight 1.5B router model classifies user intent and picks the best model per request
- **Platform governance** — centralize API keys, rate limits, guardrails, and observability without touching app code
- **Runs anywhere** — single binary; self-host the router for full data privacy
## How Routing Works
The entire routing configuration is plain YAML — no code:
```yaml
model_providers:
- model: openai/gpt-4o-mini
default: true # fallback for unmatched requests
- model: openai/gpt-4o
routing_preferences:
- name: complex_reasoning
description: complex reasoning tasks, multi-step analysis
- model: anthropic/claude-sonnet-4-20250514
routing_preferences:
- name: code_generation
description: generating new code, writing functions
```
When a request arrives, Plano sends the conversation and routing preferences to Arch-Router, which classifies the intent and returns the matching route:
```
1. Request arrives → "Write binary search in Python"
2. Preferences serialized → [{"name":"code_generation", ...}, {"name":"complex_reasoning", ...}]
3. Arch-Router classifies → {"route": "code_generation"}
4. Route → Model lookup → code_generation → anthropic/claude-sonnet-4-20250514
5. Request forwarded → Claude generates the response
```
No match? Arch-Router returns `other` → Plano falls back to the default model.
The `/routing/v1/*` endpoints return the routing decision **without** forwarding to the LLM — useful for testing and validating routing behavior before going to production.
## Setup ## Setup
@ -55,6 +103,69 @@ Response:
The response tells you which model would handle this request and which route was matched, without actually making the LLM call. The response tells you which model would handle this request and which route was matched, without actually making the LLM call.
## Kubernetes Deployment (Self-hosted Arch-Router on GPU)
To run Arch-Router in-cluster using vLLM instead of the default hosted endpoint:
**0. Check your GPU node labels and taints**
```bash
kubectl get nodes --show-labels | grep -i gpu
kubectl get node <gpu-node-name> -o jsonpath='{.spec.taints}'
```
GPU nodes commonly have a `nvidia.com/gpu:NoSchedule` taint — `vllm-deployment.yaml` includes a matching toleration. If you have multiple GPU node pools and need to pin to a specific one, uncomment and set the `nodeSelector` in `vllm-deployment.yaml` using the label for your cloud provider.
**1. Deploy Arch-Router and Plano:**
```bash
# arch-router deployment
kubectl apply -f vllm-deployment.yaml
# plano deployment
kubectl create secret generic plano-secrets \
--from-literal=OPENAI_API_KEY=$OPENAI_API_KEY \
--from-literal=ANTHROPIC_API_KEY=$ANTHROPIC_API_KEY
kubectl create configmap plano-config \
--from-file=plano_config.yaml=config_k8s.yaml \
--dry-run=client -o yaml | kubectl apply -f -
kubectl apply -f plano-deployment.yaml
```
**2. Wait for both pods to be ready:**
```bash
# Arch-Router downloads the model (~1 min) then vLLM loads it (~2 min)
kubectl get pods -l app=arch-router -w
kubectl rollout status deployment/plano
```
**3. Test:**
```bash
kubectl port-forward svc/plano 12000:12000
./demo.sh
```
To confirm requests are hitting your in-cluster Arch-Router (not just health checks):
```bash
kubectl logs -l app=arch-router -f --tail=0
# Look for POST /v1/chat/completions entries
```
**Updating the config:**
```bash
kubectl create configmap plano-config \
--from-file=plano_config.yaml=config_k8s.yaml \
--dry-run=client -o yaml | kubectl apply -f -
kubectl rollout restart deployment/plano
```
## Demo Output ## Demo Output
``` ```

View file

@ -0,0 +1,33 @@
version: v0.3.0
overrides:
llm_routing_model: plano/Arch-Router
listeners:
- type: model
name: model_listener
port: 12000
model_providers:
- model: plano/Arch-Router
base_url: http://arch-router:10000
- model: openai/gpt-4o-mini
access_key: $OPENAI_API_KEY
default: true
- model: openai/gpt-4o
access_key: $OPENAI_API_KEY
routing_preferences:
- name: complex_reasoning
description: complex reasoning tasks, multi-step analysis, or detailed explanations
- model: anthropic/claude-sonnet-4-20250514
access_key: $ANTHROPIC_API_KEY
routing_preferences:
- name: code_generation
description: generating new code, writing functions, or creating boilerplate
tracing:
random_sampling: 100

View file

@ -0,0 +1,68 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: plano
labels:
app: plano
spec:
replicas: 1
selector:
matchLabels:
app: plano
template:
metadata:
labels:
app: plano
spec:
containers:
- name: plano
image: katanemo/plano:0.4.12
ports:
- containerPort: 12000 # LLM gateway (chat completions, model routing)
name: llm-gateway
envFrom:
- secretRef:
name: plano-secrets
env:
- name: LOG_LEVEL
value: "info"
volumeMounts:
- name: plano-config
mountPath: /app/plano_config.yaml
subPath: plano_config.yaml
readOnly: true
readinessProbe:
httpGet:
path: /healthz
port: 12000
initialDelaySeconds: 5
periodSeconds: 10
livenessProbe:
httpGet:
path: /healthz
port: 12000
initialDelaySeconds: 10
periodSeconds: 30
resources:
requests:
memory: "256Mi"
cpu: "250m"
limits:
memory: "512Mi"
cpu: "1000m"
volumes:
- name: plano-config
configMap:
name: plano-config
---
apiVersion: v1
kind: Service
metadata:
name: plano
spec:
selector:
app: plano
ports:
- name: llm-gateway
port: 12000
targetPort: 12000

View file

@ -0,0 +1,36 @@
### Code generation query (OpenAI format) — expects anthropic/claude-sonnet
POST http://localhost:12000/routing/v1/chat/completions
Content-Type: application/json
{
"model": "gpt-4o-mini",
"messages": [{"role": "user", "content": "Write a Python function for binary search"}]
}
### Complex reasoning query (OpenAI format) — expects openai/gpt-4o
POST http://localhost:12000/routing/v1/chat/completions
Content-Type: application/json
{
"model": "gpt-4o-mini",
"messages": [{"role": "user", "content": "Analyze the trade-offs between microservices and monolithic architecture"}]
}
### Simple query — no routing match, expects default model
POST http://localhost:12000/routing/v1/chat/completions
Content-Type: application/json
{
"model": "gpt-4o-mini",
"messages": [{"role": "user", "content": "Hello"}]
}
### Code generation query (Anthropic format)
POST http://localhost:12000/routing/v1/messages
Content-Type: application/json
{
"model": "claude-sonnet-4-20250514",
"max_tokens": 1024,
"messages": [{"role": "user", "content": "Write a REST API in Go using Gin"}]
}

View file

@ -0,0 +1,104 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: arch-router
labels:
app: arch-router
spec:
replicas: 1
selector:
matchLabels:
app: arch-router
template:
metadata:
labels:
app: arch-router
spec:
tolerations:
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
# Optional: add a nodeSelector to pin to a specific GPU node pool.
# The nvidia.com/gpu resource request below is sufficient for most clusters.
# nodeSelector:
# DigitalOcean: doks.digitalocean.com/gpu-model: l40s
# GKE: cloud.google.com/gke-accelerator: nvidia-l4
# EKS: eks.amazonaws.com/nodegroup: gpu-nodes
# AKS: kubernetes.azure.com/agentpool: gpupool
initContainers:
- name: download-model
image: python:3.11-slim
command:
- sh
- -c
- |
pip install huggingface_hub[cli] && \
python -c "from huggingface_hub import snapshot_download; snapshot_download('katanemo/Arch-Router-1.5B.gguf', local_dir='/models/Arch-Router-1.5B.gguf')"
volumeMounts:
- name: model-cache
mountPath: /models
containers:
- name: vllm
image: vllm/vllm-openai:latest
command:
- vllm
- serve
- /models/Arch-Router-1.5B.gguf/Arch-Router-1.5B-Q4_K_M.gguf
- "--host"
- "0.0.0.0"
- "--port"
- "10000"
- "--load-format"
- "gguf"
- "--tokenizer"
- "katanemo/Arch-Router-1.5B"
- "--served-model-name"
- "Arch-Router"
- "--gpu-memory-utilization"
- "0.3"
- "--tensor-parallel-size"
- "1"
- "--enable-prefix-caching"
ports:
- name: http
containerPort: 10000
protocol: TCP
resources:
requests:
cpu: "1"
memory: "4Gi"
nvidia.com/gpu: "1"
limits:
cpu: "4"
memory: "8Gi"
nvidia.com/gpu: "1"
volumeMounts:
- name: model-cache
mountPath: /models
readinessProbe:
httpGet:
path: /health
port: 10000
initialDelaySeconds: 60
periodSeconds: 10
livenessProbe:
httpGet:
path: /health
port: 10000
initialDelaySeconds: 180
periodSeconds: 30
volumes:
- name: model-cache
emptyDir: {}
---
apiVersion: v1
kind: Service
metadata:
name: arch-router
spec:
selector:
app: arch-router
ports:
- name: http
port: 10000
targetPort: 10000

View file

@ -1,8 +1,7 @@
version: v0.1.0 version: v0.1.0
routing: overrides:
model: Arch-Router llm_routing_model: Arch-Router
llm_provider: arch-router
listeners: listeners:
egress_traffic: egress_traffic:

View file

@ -1,8 +1,7 @@
version: v0.3.0 version: v0.3.0
routing: overrides:
model: Arch-Router llm_routing_model: plano/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M
llm_provider: arch-router
listeners: listeners:
- type: model - type: model
@ -11,8 +10,7 @@ listeners:
model_providers: model_providers:
- name: arch-router - model: plano/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M
model: arch/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M
base_url: http://localhost:11434 base_url: http://localhost:11434
- model: openai/gpt-4o-mini - model: openai/gpt-4o-mini

View file

@ -17,7 +17,7 @@ from sphinxawesome_theme.postprocess import Icons
project = "Plano Docs" project = "Plano Docs"
copyright = "2025, Katanemo Labs, Inc" copyright = "2025, Katanemo Labs, Inc"
author = "Katanemo Labs, Inc" author = "Katanemo Labs, Inc"
release = " v0.4.11" release = " v0.4.12"
# -- General configuration --------------------------------------------------- # -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

View file

@ -43,7 +43,7 @@ Plano's CLI allows you to manage and interact with the Plano efficiently. To ins
.. code-block:: console .. code-block:: console
$ uv tool install planoai==0.4.11 $ uv tool install planoai==0.4.12
**Option 2: Install with pip (Traditional)** **Option 2: Install with pip (Traditional)**
@ -51,7 +51,7 @@ Plano's CLI allows you to manage and interact with the Plano efficiently. To ins
$ python -m venv venv $ python -m venv venv
$ source venv/bin/activate # On Windows, use: venv\Scripts\activate $ source venv/bin/activate # On Windows, use: venv\Scripts\activate
$ pip install planoai==0.4.11 $ pip install planoai==0.4.12
.. _llm_routing_quickstart: .. _llm_routing_quickstart:

View file

@ -253,13 +253,11 @@ Using Ollama (recommended for local development)
.. code-block:: yaml .. code-block:: yaml
routing: overrides:
model: Arch-Router llm_routing_model: plano/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M
llm_provider: arch-router
model_providers: model_providers:
- name: arch-router - model: plano/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M
model: arch/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M
base_url: http://localhost:11434 base_url: http://localhost:11434
- model: openai/gpt-5.2 - model: openai/gpt-5.2
@ -324,13 +322,11 @@ vLLM provides higher throughput and GPU optimizations suitable for production de
.. code-block:: yaml .. code-block:: yaml
routing: overrides:
model: Arch-Router llm_routing_model: plano/Arch-Router
llm_provider: arch-router
model_providers: model_providers:
- name: arch-router - model: plano/Arch-Router
model: Arch-Router
base_url: http://<your-server-ip>:10000 base_url: http://<your-server-ip>:10000
- model: openai/gpt-5.2 - model: openai/gpt-5.2
@ -351,6 +347,35 @@ vLLM provides higher throughput and GPU optimizations suitable for production de
curl http://localhost:10000/v1/models curl http://localhost:10000/v1/models
Using vLLM on Kubernetes (GPU nodes)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
For teams running Kubernetes, Arch-Router and Plano can be deployed as in-cluster services.
The ``demos/llm_routing/model_routing_service/`` directory includes ready-to-use manifests:
- ``vllm-deployment.yaml`` — Arch-Router served by vLLM, with an init container to download
the model from HuggingFace
- ``plano-deployment.yaml`` — Plano proxy configured to use the in-cluster Arch-Router
- ``config_k8s.yaml`` — Plano config with ``llm_routing_model`` pointing at
``http://arch-router:10000`` instead of the default hosted endpoint
Key things to know before deploying:
- GPU nodes commonly have a ``nvidia.com/gpu:NoSchedule`` taint — the ``vllm-deployment.yaml``
includes a matching toleration. The ``nvidia.com/gpu: "1"`` resource request is sufficient
for scheduling in most clusters; a ``nodeSelector`` is optional and commented out in the
manifest for cases where you need to pin to a specific GPU node pool.
- Model download takes ~1 minute; vLLM loads the model in ~1-2 minutes after that. The
``livenessProbe`` has a 180-second ``initialDelaySeconds`` to avoid premature restarts.
- The Plano config ConfigMap must use ``--from-file=plano_config.yaml=config_k8s.yaml`` with
``subPath`` in the Deployment — omitting ``subPath`` causes Kubernetes to mount a directory
instead of a file.
For the canonical Plano Kubernetes deployment (ConfigMap, Secrets, Deployment YAML), see
:ref:`deployment`. For full step-by-step commands specific to this demo, see the
`demo README <https://github.com/katanemo/plano/tree/main/demos/llm_routing/model_routing_service/README.md>`_.
Combining Routing Methods Combining Routing Methods
------------------------- -------------------------

View file

@ -335,6 +335,90 @@ Combine RAG agents for documentation lookup with specialized troubleshooting age
- id: troubleshoot_agent - id: troubleshoot_agent
description: Diagnoses and resolves technical issues step by step description: Diagnoses and resolves technical issues step by step
Self-hosting Plano-Orchestrator
-------------------------------
By default, Plano uses a hosted Plano-Orchestrator endpoint. To self-host the orchestrator model, you can serve it using **vLLM** on a server with an NVIDIA GPU.
.. note::
vLLM requires a Linux server with an NVIDIA GPU (CUDA). For local development on macOS, a GGUF version for Ollama is coming soon.
The following model variants are available on HuggingFace:
* `Plano-Orchestrator-4B <https://huggingface.co/katanemo/Plano-Orchestrator-4B>`_ — lighter model, suitable for development and testing
* `Plano-Orchestrator-4B-FP8 <https://huggingface.co/katanemo/Plano-Orchestrator-4B-FP8>`_ — FP8 quantized 4B model, lower memory usage
* `Plano-Orchestrator-30B-A3B <https://huggingface.co/katanemo/Plano-Orchestrator-30B-A3B>`_ — full-size model for production
* `Plano-Orchestrator-30B-A3B-FP8 <https://huggingface.co/katanemo/Plano-Orchestrator-30B-A3B-FP8>`_ — FP8 quantized 30B model, recommended for production deployments
Using vLLM
~~~~~~~~~~
1. **Install vLLM**
.. code-block:: bash
pip install vllm
2. **Download the model and chat template**
.. code-block:: bash
pip install huggingface_hub
huggingface-cli download katanemo/Plano-Orchestrator-4B
3. **Start the vLLM server**
For the 4B model (development):
.. code-block:: bash
vllm serve katanemo/Plano-Orchestrator-4B \
--host 0.0.0.0 \
--port 8000 \
--tensor-parallel-size 1 \
--gpu-memory-utilization 0.3 \
--tokenizer katanemo/Plano-Orchestrator-4B \
--chat-template chat_template.jinja \
--served-model-name katanemo/Plano-Orchestrator-4B \
--enable-prefix-caching
For the 30B-A3B-FP8 model (production):
.. code-block:: bash
vllm serve katanemo/Plano-Orchestrator-30B-A3B-FP8 \
--host 0.0.0.0 \
--port 8000 \
--tensor-parallel-size 1 \
--gpu-memory-utilization 0.9 \
--tokenizer katanemo/Plano-Orchestrator-30B-A3B-FP8 \
--chat-template chat_template.jinja \
--max-model-len 32768 \
--served-model-name katanemo/Plano-Orchestrator-30B-A3B-FP8 \
--enable-prefix-caching
4. **Configure Plano to use the local orchestrator**
Use the model name matching your ``--served-model-name``:
.. code-block:: yaml
overrides:
agent_orchestration_model: plano/katanemo/Plano-Orchestrator-4B
model_providers:
- model: katanemo/Plano-Orchestrator-4B
provider_interface: plano
base_url: http://<your-server-ip>:8000
5. **Verify the server is running**
.. code-block:: bash
curl http://localhost:8000/health
curl http://localhost:8000/v1/models
Next Steps Next Steps
---------- ----------

View file

@ -65,7 +65,7 @@ Create a ``docker-compose.yml`` file with the following configuration:
# docker-compose.yml # docker-compose.yml
services: services:
plano: plano:
image: katanemo/plano:0.4.11 image: katanemo/plano:0.4.12
container_name: plano container_name: plano
ports: ports:
- "10000:10000" # ingress (client -> plano) - "10000:10000" # ingress (client -> plano)
@ -153,7 +153,7 @@ Create a ``plano-deployment.yaml``:
spec: spec:
containers: containers:
- name: plano - name: plano
image: katanemo/plano:0.4.11 image: katanemo/plano:0.4.12
ports: ports:
- containerPort: 12000 # LLM gateway (chat completions, model routing) - containerPort: 12000 # LLM gateway (chat completions, model routing)
name: llm-gateway name: llm-gateway

View file

@ -107,11 +107,11 @@ model_providers:
- internal: true - internal: true
model: Arch-Function model: Arch-Function
name: arch-function name: arch-function
provider_interface: arch provider_interface: plano
- internal: true - internal: true
model: Plano-Orchestrator model: Plano-Orchestrator
name: plano-orchestrator name: plano/orchestrator
provider_interface: arch provider_interface: plano
prompt_targets: prompt_targets:
- description: Get current weather at a location. - description: Get current weather at a location.
endpoint: endpoint: