mirror of
https://github.com/katanemo/plano.git
synced 2026-04-25 00:36:34 +02:00
Unified overrides for custom router and orchestrator models (#820)
* support configurable orchestrator model via orchestration config section * add self-hosting docs and demo for Plano-Orchestrator * list all Plano-Orchestrator model variants in docs * use overrides for custom routing and orchestration model * update docs * update orchestrator model name * rename arch provider to plano, use llm_routing_model and agent_orchestration_model * regenerate rendered config reference
This commit is contained in:
parent
785bf7e021
commit
bc059aed4d
20 changed files with 312 additions and 103 deletions
|
|
@ -8,13 +8,12 @@ from urllib.parse import urlparse
|
|||
from copy import deepcopy
|
||||
from planoai.consts import DEFAULT_OTEL_TRACING_GRPC_ENDPOINT
|
||||
|
||||
|
||||
SUPPORTED_PROVIDERS_WITH_BASE_URL = [
|
||||
"azure_openai",
|
||||
"ollama",
|
||||
"qwen",
|
||||
"amazon_bedrock",
|
||||
"arch",
|
||||
"plano",
|
||||
]
|
||||
|
||||
SUPPORTED_PROVIDERS_WITHOUT_BASE_URL = [
|
||||
|
|
@ -368,47 +367,52 @@ def validate_and_render_schema():
|
|||
llms_with_endpoint.append(model_provider)
|
||||
llms_with_endpoint_cluster_names.add(cluster_name)
|
||||
|
||||
if len(model_usage_name_keys) > 0:
|
||||
routing_model_provider = config_yaml.get("routing", {}).get(
|
||||
"model_provider", None
|
||||
overrides_config = config_yaml.get("overrides", {})
|
||||
# Build lookup of model names (already prefix-stripped by config processing)
|
||||
model_name_set = {mp.get("model") for mp in updated_model_providers}
|
||||
|
||||
# Auto-add arch-router provider if routing preferences exist and no provider matches the router model
|
||||
router_model = overrides_config.get("llm_routing_model", "Arch-Router")
|
||||
# Strip provider prefix for comparison since config processing strips prefixes from model names
|
||||
router_model_id = (
|
||||
router_model.split("/", 1)[1] if "/" in router_model else router_model
|
||||
)
|
||||
if len(model_usage_name_keys) > 0 and router_model_id not in model_name_set:
|
||||
updated_model_providers.append(
|
||||
{
|
||||
"name": "arch-router",
|
||||
"provider_interface": "plano",
|
||||
"model": router_model_id,
|
||||
"internal": True,
|
||||
}
|
||||
)
|
||||
if (
|
||||
routing_model_provider
|
||||
and routing_model_provider not in model_provider_name_set
|
||||
):
|
||||
raise Exception(
|
||||
f"Routing model_provider {routing_model_provider} is not defined in model_providers"
|
||||
)
|
||||
if (
|
||||
routing_model_provider is None
|
||||
and "arch-router" not in model_provider_name_set
|
||||
):
|
||||
updated_model_providers.append(
|
||||
{
|
||||
"name": "arch-router",
|
||||
"provider_interface": "arch",
|
||||
"model": config_yaml.get("routing", {}).get("model", "Arch-Router"),
|
||||
"internal": True,
|
||||
}
|
||||
)
|
||||
|
||||
# Always add arch-function model provider if not already defined
|
||||
if "arch-function" not in model_provider_name_set:
|
||||
updated_model_providers.append(
|
||||
{
|
||||
"name": "arch-function",
|
||||
"provider_interface": "arch",
|
||||
"provider_interface": "plano",
|
||||
"model": "Arch-Function",
|
||||
"internal": True,
|
||||
}
|
||||
)
|
||||
|
||||
if "plano-orchestrator" not in model_provider_name_set:
|
||||
# Auto-add plano-orchestrator provider if no provider matches the orchestrator model
|
||||
orchestrator_model = overrides_config.get(
|
||||
"agent_orchestration_model", "Plano-Orchestrator"
|
||||
)
|
||||
orchestrator_model_id = (
|
||||
orchestrator_model.split("/", 1)[1]
|
||||
if "/" in orchestrator_model
|
||||
else orchestrator_model
|
||||
)
|
||||
if orchestrator_model_id not in model_name_set:
|
||||
updated_model_providers.append(
|
||||
{
|
||||
"name": "plano-orchestrator",
|
||||
"provider_interface": "arch",
|
||||
"model": "Plano-Orchestrator",
|
||||
"name": "plano/orchestrator",
|
||||
"provider_interface": "plano",
|
||||
"model": orchestrator_model_id,
|
||||
"internal": True,
|
||||
}
|
||||
)
|
||||
|
|
|
|||
|
|
@ -594,13 +594,13 @@ static_resources:
|
|||
|
||||
clusters:
|
||||
|
||||
- name: arch
|
||||
- name: plano
|
||||
connect_timeout: {{ upstream_connect_timeout | default('5s') }}
|
||||
type: LOGICAL_DNS
|
||||
dns_lookup_family: V4_ONLY
|
||||
lb_policy: ROUND_ROBIN
|
||||
load_assignment:
|
||||
cluster_name: arch
|
||||
cluster_name: plano
|
||||
endpoints:
|
||||
- lb_endpoints:
|
||||
- endpoint:
|
||||
|
|
|
|||
|
|
@ -173,7 +173,7 @@ properties:
|
|||
provider_interface:
|
||||
type: string
|
||||
enum:
|
||||
- arch
|
||||
- plano
|
||||
- claude
|
||||
- deepseek
|
||||
- groq
|
||||
|
|
@ -220,7 +220,7 @@ properties:
|
|||
provider_interface:
|
||||
type: string
|
||||
enum:
|
||||
- arch
|
||||
- plano
|
||||
- claude
|
||||
- deepseek
|
||||
- groq
|
||||
|
|
@ -271,6 +271,12 @@ properties:
|
|||
upstream_tls_ca_path:
|
||||
type: string
|
||||
description: "Path to the trusted CA bundle for upstream TLS verification. Default is '/etc/ssl/certs/ca-certificates.crt'."
|
||||
llm_routing_model:
|
||||
type: string
|
||||
description: "Model name for the LLM router (e.g., 'Arch-Router'). Must match a model in model_providers."
|
||||
agent_orchestration_model:
|
||||
type: string
|
||||
description: "Model name for the agent orchestrator (e.g., 'Plano-Orchestrator'). Must match a model in model_providers."
|
||||
system_prompt:
|
||||
type: string
|
||||
prompt_targets:
|
||||
|
|
@ -408,14 +414,6 @@ properties:
|
|||
enum:
|
||||
- llm
|
||||
- prompt
|
||||
routing:
|
||||
type: object
|
||||
properties:
|
||||
llm_provider:
|
||||
type: string
|
||||
model:
|
||||
type: string
|
||||
additionalProperties: false
|
||||
state_storage:
|
||||
type: object
|
||||
properties:
|
||||
|
|
|
|||
|
|
@ -178,6 +178,7 @@ mod tests {
|
|||
Arc::new(OrchestratorService::new(
|
||||
"http://localhost:8080".to_string(),
|
||||
"test-model".to_string(),
|
||||
"plano-orchestrator".to_string(),
|
||||
))
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -23,6 +23,7 @@ mod tests {
|
|||
Arc::new(OrchestratorService::new(
|
||||
"http://localhost:8080".to_string(),
|
||||
"test-model".to_string(),
|
||||
"plano-orchestrator".to_string(),
|
||||
))
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -11,9 +11,7 @@ use brightstaff::state::StateStorage;
|
|||
use brightstaff::utils::tracing::init_tracer;
|
||||
use bytes::Bytes;
|
||||
use common::configuration::{Agent, Configuration};
|
||||
use common::consts::{
|
||||
CHAT_COMPLETIONS_PATH, MESSAGES_PATH, OPENAI_RESPONSES_API_PATH, PLANO_ORCHESTRATOR_MODEL_NAME,
|
||||
};
|
||||
use common::consts::{CHAT_COMPLETIONS_PATH, MESSAGES_PATH, OPENAI_RESPONSES_API_PATH};
|
||||
use common::llm_providers::LlmProviders;
|
||||
use http_body_util::{combinators::BoxBody, BodyExt, Empty};
|
||||
use hyper::body::Incoming;
|
||||
|
|
@ -35,6 +33,8 @@ pub mod router;
|
|||
const BIND_ADDRESS: &str = "0.0.0.0:9091";
|
||||
const DEFAULT_ROUTING_LLM_PROVIDER: &str = "arch-router";
|
||||
const DEFAULT_ROUTING_MODEL_NAME: &str = "Arch-Router";
|
||||
const DEFAULT_ORCHESTRATOR_LLM_PROVIDER: &str = "plano-orchestrator";
|
||||
const DEFAULT_ORCHESTRATOR_MODEL_NAME: &str = "Plano-Orchestrator";
|
||||
|
||||
// Utility function to extract the context from the incoming request headers
|
||||
fn extract_context_from_request(req: &Request<Incoming>) -> Context {
|
||||
|
|
@ -90,16 +90,21 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
|||
env::var("LLM_PROVIDER_ENDPOINT").unwrap_or_else(|_| "http://localhost:12001".to_string());
|
||||
|
||||
let listener = TcpListener::bind(bind_address).await?;
|
||||
let routing_model_name: String = plano_config
|
||||
.routing
|
||||
.as_ref()
|
||||
.and_then(|r| r.model.clone())
|
||||
.unwrap_or_else(|| DEFAULT_ROUTING_MODEL_NAME.to_string());
|
||||
let overrides = plano_config.overrides.clone().unwrap_or_default();
|
||||
|
||||
// Strip provider prefix (e.g. "plano/") to get the model ID used in upstream requests
|
||||
let routing_model_name: String = overrides
|
||||
.llm_routing_model
|
||||
.as_deref()
|
||||
.map(|m| m.split_once('/').map(|(_, id)| id).unwrap_or(m))
|
||||
.unwrap_or(DEFAULT_ROUTING_MODEL_NAME)
|
||||
.to_string();
|
||||
|
||||
let routing_llm_provider = plano_config
|
||||
.routing
|
||||
.as_ref()
|
||||
.and_then(|r| r.model_provider.clone())
|
||||
.model_providers
|
||||
.iter()
|
||||
.find(|p| p.model.as_deref() == Some(routing_model_name.as_str()))
|
||||
.map(|p| p.name.clone())
|
||||
.unwrap_or_else(|| DEFAULT_ROUTING_LLM_PROVIDER.to_string());
|
||||
|
||||
let router_service: Arc<RouterService> = Arc::new(RouterService::new(
|
||||
|
|
@ -109,9 +114,25 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
|||
routing_llm_provider,
|
||||
));
|
||||
|
||||
// Strip provider prefix (e.g. "plano/") to get the model ID used in upstream requests
|
||||
let orchestrator_model_name: String = overrides
|
||||
.agent_orchestration_model
|
||||
.as_deref()
|
||||
.map(|m| m.split_once('/').map(|(_, id)| id).unwrap_or(m))
|
||||
.unwrap_or(DEFAULT_ORCHESTRATOR_MODEL_NAME)
|
||||
.to_string();
|
||||
|
||||
let orchestrator_llm_provider: String = plano_config
|
||||
.model_providers
|
||||
.iter()
|
||||
.find(|p| p.model.as_deref() == Some(orchestrator_model_name.as_str()))
|
||||
.map(|p| p.name.clone())
|
||||
.unwrap_or_else(|| DEFAULT_ORCHESTRATOR_LLM_PROVIDER.to_string());
|
||||
|
||||
let orchestrator_service: Arc<OrchestratorService> = Arc::new(OrchestratorService::new(
|
||||
format!("{llm_provider_url}{CHAT_COMPLETIONS_PATH}"),
|
||||
PLANO_ORCHESTRATOR_MODEL_NAME.to_string(),
|
||||
orchestrator_model_name,
|
||||
orchestrator_llm_provider,
|
||||
));
|
||||
|
||||
let model_aliases = Arc::new(plano_config.model_aliases.clone());
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@ use std::{collections::HashMap, sync::Arc};
|
|||
|
||||
use common::{
|
||||
configuration::{AgentUsagePreference, OrchestrationPreference},
|
||||
consts::{ARCH_PROVIDER_HINT_HEADER, PLANO_ORCHESTRATOR_MODEL_NAME, REQUEST_ID_HEADER},
|
||||
consts::{ARCH_PROVIDER_HINT_HEADER, REQUEST_ID_HEADER},
|
||||
};
|
||||
use hermesllm::apis::openai::{ChatCompletionsResponse, Message};
|
||||
use hyper::header;
|
||||
|
|
@ -19,6 +19,7 @@ pub struct OrchestratorService {
|
|||
orchestrator_url: String,
|
||||
client: reqwest::Client,
|
||||
orchestrator_model: Arc<dyn OrchestratorModel>,
|
||||
orchestrator_provider_name: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
|
|
@ -36,7 +37,11 @@ pub enum OrchestrationError {
|
|||
pub type Result<T> = std::result::Result<T, OrchestrationError>;
|
||||
|
||||
impl OrchestratorService {
|
||||
pub fn new(orchestrator_url: String, orchestration_model_name: String) -> Self {
|
||||
pub fn new(
|
||||
orchestrator_url: String,
|
||||
orchestration_model_name: String,
|
||||
orchestrator_provider_name: String,
|
||||
) -> Self {
|
||||
// Empty agent orchestrations - will be provided via usage_preferences in requests
|
||||
let agent_orchestrations: HashMap<String, Vec<OrchestrationPreference>> = HashMap::new();
|
||||
|
||||
|
|
@ -50,6 +55,7 @@ impl OrchestratorService {
|
|||
orchestrator_url,
|
||||
client: reqwest::Client::new(),
|
||||
orchestrator_model,
|
||||
orchestrator_provider_name,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -75,12 +81,12 @@ impl OrchestratorService {
|
|||
debug!(
|
||||
model = %self.orchestrator_model.get_model_name(),
|
||||
endpoint = %self.orchestrator_url,
|
||||
"sending request to arch-orchestrator"
|
||||
"sending request to plano-orchestrator"
|
||||
);
|
||||
|
||||
debug!(
|
||||
body = %serde_json::to_string(&orchestrator_request).unwrap(),
|
||||
"arch orchestrator request"
|
||||
"plano orchestrator request"
|
||||
);
|
||||
|
||||
let mut orchestration_request_headers = header::HeaderMap::new();
|
||||
|
|
@ -91,7 +97,7 @@ impl OrchestratorService {
|
|||
|
||||
orchestration_request_headers.insert(
|
||||
header::HeaderName::from_static(ARCH_PROVIDER_HINT_HEADER),
|
||||
header::HeaderValue::from_str(PLANO_ORCHESTRATOR_MODEL_NAME).unwrap(),
|
||||
header::HeaderValue::from_str(&self.orchestrator_provider_name).unwrap(),
|
||||
);
|
||||
|
||||
// Inject OpenTelemetry trace context from current span
|
||||
|
|
@ -110,7 +116,7 @@ impl OrchestratorService {
|
|||
|
||||
orchestration_request_headers.insert(
|
||||
header::HeaderName::from_static("model"),
|
||||
header::HeaderValue::from_static(PLANO_ORCHESTRATOR_MODEL_NAME),
|
||||
header::HeaderValue::from_str(&self.orchestrator_provider_name).unwrap(),
|
||||
);
|
||||
|
||||
let start_time = std::time::Instant::now();
|
||||
|
|
|
|||
|
|
@ -7,12 +7,6 @@ use crate::api::open_ai::{
|
|||
ChatCompletionTool, FunctionDefinition, FunctionParameter, FunctionParameters, ParameterType,
|
||||
};
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Routing {
|
||||
pub model_provider: Option<String>,
|
||||
pub model: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ModelAlias {
|
||||
pub target: String,
|
||||
|
|
@ -72,7 +66,6 @@ pub struct Configuration {
|
|||
pub ratelimits: Option<Vec<Ratelimit>>,
|
||||
pub tracing: Option<Tracing>,
|
||||
pub mode: Option<GatewayMode>,
|
||||
pub routing: Option<Routing>,
|
||||
pub agents: Option<Vec<Agent>>,
|
||||
pub filters: Option<Vec<Agent>>,
|
||||
pub listeners: Vec<Listener>,
|
||||
|
|
@ -84,6 +77,8 @@ pub struct Overrides {
|
|||
pub prompt_target_intent_matching_threshold: Option<f64>,
|
||||
pub optimize_context_window: Option<bool>,
|
||||
pub use_agent_orchestrator: Option<bool>,
|
||||
pub llm_routing_model: Option<String>,
|
||||
pub agent_orchestration_model: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
|
||||
|
|
@ -207,8 +202,6 @@ pub struct EmbeddingProviver {
|
|||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
|
||||
pub enum LlmProviderType {
|
||||
#[serde(rename = "arch")]
|
||||
Arch,
|
||||
#[serde(rename = "anthropic")]
|
||||
Anthropic,
|
||||
#[serde(rename = "deepseek")]
|
||||
|
|
@ -237,12 +230,13 @@ pub enum LlmProviderType {
|
|||
Qwen,
|
||||
#[serde(rename = "amazon_bedrock")]
|
||||
AmazonBedrock,
|
||||
#[serde(rename = "plano")]
|
||||
Plano,
|
||||
}
|
||||
|
||||
impl Display for LlmProviderType {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
LlmProviderType::Arch => write!(f, "arch"),
|
||||
LlmProviderType::Anthropic => write!(f, "anthropic"),
|
||||
LlmProviderType::Deepseek => write!(f, "deepseek"),
|
||||
LlmProviderType::Groq => write!(f, "groq"),
|
||||
|
|
@ -257,6 +251,7 @@ impl Display for LlmProviderType {
|
|||
LlmProviderType::Zhipu => write!(f, "zhipu"),
|
||||
LlmProviderType::Qwen => write!(f, "qwen"),
|
||||
LlmProviderType::AmazonBedrock => write!(f, "amazon_bedrock"),
|
||||
LlmProviderType::Plano => write!(f, "plano"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -591,14 +586,14 @@ mod test {
|
|||
},
|
||||
LlmProvider {
|
||||
name: "arch-router".to_string(),
|
||||
provider_interface: LlmProviderType::Arch,
|
||||
provider_interface: LlmProviderType::Plano,
|
||||
model: Some("Arch-Router".to_string()),
|
||||
internal: Some(true),
|
||||
..Default::default()
|
||||
},
|
||||
LlmProvider {
|
||||
name: "plano-orchestrator".to_string(),
|
||||
provider_interface: LlmProviderType::Arch,
|
||||
provider_interface: LlmProviderType::Plano,
|
||||
model: Some("Plano-Orchestrator".to_string()),
|
||||
internal: Some(true),
|
||||
..Default::default()
|
||||
|
|
|
|||
|
|
@ -33,5 +33,4 @@ pub const OTEL_COLLECTOR_HTTP: &str = "opentelemetry_collector_http";
|
|||
pub const LLM_ROUTE_HEADER: &str = "x-arch-llm-route";
|
||||
pub const ENVOY_RETRY_HEADER: &str = "x-envoy-max-retries";
|
||||
pub const BRIGHT_STAFF_SERVICE_NAME: &str = "brightstaff";
|
||||
pub const PLANO_ORCHESTRATOR_MODEL_NAME: &str = "Plano-Orchestrator";
|
||||
pub const ARCH_FC_CLUSTER: &str = "arch";
|
||||
pub const PLANO_FC_CLUSTER: &str = "plano";
|
||||
|
|
|
|||
|
|
@ -35,7 +35,7 @@ mod tests {
|
|||
ProviderId::Mistral
|
||||
);
|
||||
assert_eq!(ProviderId::try_from("groq").unwrap(), ProviderId::Groq);
|
||||
assert_eq!(ProviderId::try_from("arch").unwrap(), ProviderId::Arch);
|
||||
assert_eq!(ProviderId::try_from("plano").unwrap(), ProviderId::Plano);
|
||||
|
||||
// Test aliases
|
||||
assert_eq!(ProviderId::try_from("google").unwrap(), ProviderId::Gemini);
|
||||
|
|
|
|||
|
|
@ -34,7 +34,7 @@ pub enum ProviderId {
|
|||
Gemini,
|
||||
Anthropic,
|
||||
GitHub,
|
||||
Arch,
|
||||
Plano,
|
||||
AzureOpenAI,
|
||||
XAI,
|
||||
TogetherAI,
|
||||
|
|
@ -58,7 +58,7 @@ impl TryFrom<&str> for ProviderId {
|
|||
"google" => Ok(ProviderId::Gemini), // alias
|
||||
"anthropic" => Ok(ProviderId::Anthropic),
|
||||
"github" => Ok(ProviderId::GitHub),
|
||||
"arch" => Ok(ProviderId::Arch),
|
||||
"plano" => Ok(ProviderId::Plano),
|
||||
"azure_openai" => Ok(ProviderId::AzureOpenAI),
|
||||
"xai" => Ok(ProviderId::XAI),
|
||||
"together_ai" => Ok(ProviderId::TogetherAI),
|
||||
|
|
@ -135,7 +135,7 @@ impl ProviderId {
|
|||
| ProviderId::Groq
|
||||
| ProviderId::Mistral
|
||||
| ProviderId::Deepseek
|
||||
| ProviderId::Arch
|
||||
| ProviderId::Plano
|
||||
| ProviderId::Gemini
|
||||
| ProviderId::GitHub
|
||||
| ProviderId::AzureOpenAI
|
||||
|
|
@ -153,7 +153,7 @@ impl ProviderId {
|
|||
| ProviderId::Groq
|
||||
| ProviderId::Mistral
|
||||
| ProviderId::Deepseek
|
||||
| ProviderId::Arch
|
||||
| ProviderId::Plano
|
||||
| ProviderId::Gemini
|
||||
| ProviderId::GitHub
|
||||
| ProviderId::AzureOpenAI
|
||||
|
|
@ -219,7 +219,7 @@ impl Display for ProviderId {
|
|||
ProviderId::Gemini => write!(f, "Gemini"),
|
||||
ProviderId::Anthropic => write!(f, "Anthropic"),
|
||||
ProviderId::GitHub => write!(f, "GitHub"),
|
||||
ProviderId::Arch => write!(f, "Arch"),
|
||||
ProviderId::Plano => write!(f, "Plano"),
|
||||
ProviderId::AzureOpenAI => write!(f, "azure_openai"),
|
||||
ProviderId::XAI => write!(f, "xai"),
|
||||
ProviderId::TogetherAI => write!(f, "together_ai"),
|
||||
|
|
|
|||
|
|
@ -873,7 +873,7 @@ impl HttpContext for StreamContext {
|
|||
// ensure that the provider has an endpoint if the access key is missing else return a bad request
|
||||
if self.llm_provider.as_ref().unwrap().endpoint.is_none()
|
||||
&& self.llm_provider.as_ref().unwrap().provider_interface
|
||||
!= LlmProviderType::Arch
|
||||
!= LlmProviderType::Plano
|
||||
{
|
||||
self.send_server_error(error, Some(StatusCode::BAD_REQUEST));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -123,6 +123,42 @@ Each agent:
|
|||
|
||||
Both agents run as native local processes and communicate with Plano running natively on the host.
|
||||
|
||||
## Running with local Plano-Orchestrator (via vLLM)
|
||||
|
||||
By default, Plano uses a hosted Plano-Orchestrator endpoint. To self-host the orchestrator model locally using vLLM on a server with an NVIDIA GPU:
|
||||
|
||||
1. Install vLLM (the model weights are downloaded from Hugging Face the first time `vllm serve` runs in the next step):
|
||||
```bash
|
||||
pip install vllm
|
||||
```
|
||||
|
||||
2. Start the vLLM server with the 4B model:
|
||||
```bash
|
||||
vllm serve katanemo/Plano-Orchestrator-4B \
|
||||
--host 0.0.0.0 \
|
||||
--port 8000 \
|
||||
--tensor-parallel-size 1 \
|
||||
--gpu-memory-utilization 0.3 \
|
||||
--tokenizer katanemo/Plano-Orchestrator-4B \
|
||||
--chat-template chat_template.jinja \
|
||||
--served-model-name katanemo/Plano-Orchestrator-4B \
|
||||
--enable-prefix-caching
|
||||
```
|
||||
|
||||
3. Start the demo with the local orchestrator config:
|
||||
```bash
|
||||
./run_demo.sh --local-orchestrator
|
||||
```
|
||||
|
||||
4. Test with curl:
|
||||
```bash
|
||||
curl -X POST http://localhost:8001/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"model": "gpt-5.2", "messages": [{"role": "user", "content": "What is the weather in Istanbul?"}]}'
|
||||
```
|
||||
|
||||
You should see Plano use your local orchestrator to route the request to the weather agent.
|
||||
|
||||
## Observability
|
||||
|
||||
This demo includes full OpenTelemetry (OTel) compatible distributed tracing to monitor and debug agent interactions:
|
||||
|
|
|
|||
|
|
@ -0,0 +1,66 @@
|
|||
version: v0.3.0
|
||||
|
||||
overrides:
|
||||
agent_orchestration_model: plano/katanemo/Plano-Orchestrator-4B
|
||||
|
||||
agents:
|
||||
- id: weather_agent
|
||||
url: http://localhost:10510
|
||||
- id: flight_agent
|
||||
url: http://localhost:10520
|
||||
|
||||
model_providers:
|
||||
- model: plano/katanemo/Plano-Orchestrator-4B
|
||||
base_url: http://localhost:8000
|
||||
|
||||
- model: openai/gpt-5.2
|
||||
access_key: $OPENAI_API_KEY
|
||||
default: true
|
||||
- model: openai/gpt-4o-mini
|
||||
access_key: $OPENAI_API_KEY # smaller, faster, cheaper model for extracting entities like location
|
||||
|
||||
listeners:
|
||||
- type: agent
|
||||
name: travel_booking_service
|
||||
port: 8001
|
||||
router: plano_orchestrator_v1
|
||||
agents:
|
||||
- id: weather_agent
|
||||
description: |
|
||||
|
||||
WeatherAgent is a specialized AI assistant for real-time weather information and forecasts. It provides accurate weather data for any city worldwide using the Open-Meteo API, helping travelers plan their trips with up-to-date weather conditions.
|
||||
|
||||
Capabilities:
|
||||
* Get real-time weather conditions and multi-day forecasts for any city worldwide using Open-Meteo API (free, no API key needed)
|
||||
* Provides current temperature
|
||||
* Provides multi-day forecasts
|
||||
* Provides weather conditions
|
||||
* Provides sunrise/sunset times
|
||||
* Provides detailed weather information
|
||||
* Understands conversation context to resolve location references from previous messages
|
||||
* Handles weather-related questions including "What's the weather in [city]?", "What's the forecast for [city]?", "How's the weather in [city]?"
|
||||
* When queries include both weather and other travel questions (e.g., flights, currency), this agent answers ONLY the weather part
|
||||
|
||||
- id: flight_agent
|
||||
description: |
|
||||
|
||||
FlightAgent is an AI-powered tool specialized in providing live flight information between airports. It leverages the FlightAware AeroAPI to deliver real-time flight status, gate information, and delay updates.
|
||||
|
||||
Capabilities:
|
||||
* Get live flight information between airports using FlightAware AeroAPI
|
||||
* Shows real-time flight status
|
||||
* Shows scheduled/estimated/actual departure and arrival times
|
||||
* Shows gate and terminal information
|
||||
* Shows delays
|
||||
* Shows aircraft type
|
||||
* Shows flight status
|
||||
* Automatically resolves city names to airport codes (IATA/ICAO)
|
||||
* Understands conversation context to infer origin/destination from follow-up questions
|
||||
* Handles flight-related questions including "What flights go from [city] to [city]?", "Do flights go to [city]?", "Are there direct flights from [city]?"
|
||||
* When queries include both flight and other travel questions (e.g., weather, currency), this agent answers ONLY the flight part
|
||||
|
||||
tracing:
|
||||
random_sampling: 100
|
||||
span_attributes:
|
||||
header_prefixes:
|
||||
- x-acme-
|
||||
|
|
@ -31,8 +31,13 @@ start_demo() {
|
|||
fi
|
||||
|
||||
# Step 4: Start Plano
|
||||
echo "Starting Plano with config.yaml..."
|
||||
planoai up config.yaml
|
||||
PLANO_CONFIG="config.yaml"
|
||||
if [ "$1" == "--local-orchestrator" ]; then
|
||||
PLANO_CONFIG="config_local_orchestrator.yaml"
|
||||
echo "Using local orchestrator config..."
|
||||
fi
|
||||
echo "Starting Plano with $PLANO_CONFIG..."
|
||||
planoai up "$PLANO_CONFIG"
|
||||
|
||||
# Step 5: Start agents natively
|
||||
echo "Starting agents..."
|
||||
|
|
|
|||
|
|
@ -1,8 +1,7 @@
|
|||
version: v0.1.0
|
||||
|
||||
routing:
|
||||
model: Arch-Router
|
||||
llm_provider: arch-router
|
||||
overrides:
|
||||
llm_routing_model: Arch-Router
|
||||
|
||||
listeners:
|
||||
egress_traffic:
|
||||
|
|
|
|||
|
|
@ -1,8 +1,7 @@
|
|||
version: v0.3.0
|
||||
|
||||
routing:
|
||||
model: Arch-Router
|
||||
llm_provider: arch-router
|
||||
overrides:
|
||||
llm_routing_model: plano/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M
|
||||
|
||||
listeners:
|
||||
- type: model
|
||||
|
|
@ -11,8 +10,7 @@ listeners:
|
|||
|
||||
model_providers:
|
||||
|
||||
- name: arch-router
|
||||
model: arch/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M
|
||||
- model: plano/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M
|
||||
base_url: http://localhost:11434
|
||||
|
||||
- model: openai/gpt-4o-mini
|
||||
|
|
|
|||
|
|
@ -253,13 +253,11 @@ Using Ollama (recommended for local development)
|
|||
|
||||
.. code-block:: yaml
|
||||
|
||||
routing:
|
||||
model: Arch-Router
|
||||
llm_provider: arch-router
|
||||
overrides:
|
||||
llm_routing_model: plano/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M
|
||||
|
||||
model_providers:
|
||||
- name: arch-router
|
||||
model: arch/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M
|
||||
- model: plano/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M
|
||||
base_url: http://localhost:11434
|
||||
|
||||
- model: openai/gpt-5.2
|
||||
|
|
@ -324,13 +322,11 @@ vLLM provides higher throughput and GPU optimizations suitable for production de
|
|||
|
||||
.. code-block:: yaml
|
||||
|
||||
routing:
|
||||
model: Arch-Router
|
||||
llm_provider: arch-router
|
||||
overrides:
|
||||
llm_routing_model: plano/Arch-Router
|
||||
|
||||
model_providers:
|
||||
- name: arch-router
|
||||
model: Arch-Router
|
||||
- model: plano/Arch-Router
|
||||
base_url: http://<your-server-ip>:10000
|
||||
|
||||
- model: openai/gpt-5.2
|
||||
|
|
|
|||
|
|
@ -335,6 +335,90 @@ Combine RAG agents for documentation lookup with specialized troubleshooting age
|
|||
- id: troubleshoot_agent
|
||||
description: Diagnoses and resolves technical issues step by step
|
||||
|
||||
Self-hosting Plano-Orchestrator
|
||||
-------------------------------
|
||||
|
||||
By default, Plano uses a hosted Plano-Orchestrator endpoint. To self-host the orchestrator model, you can serve it using **vLLM** on a server with an NVIDIA GPU.
|
||||
|
||||
.. note::
|
||||
vLLM requires a Linux server with an NVIDIA GPU (CUDA). For local development on macOS, a GGUF version for Ollama is coming soon.
|
||||
|
||||
The following model variants are available on HuggingFace:
|
||||
|
||||
* `Plano-Orchestrator-4B <https://huggingface.co/katanemo/Plano-Orchestrator-4B>`_ — lighter model, suitable for development and testing
|
||||
* `Plano-Orchestrator-4B-FP8 <https://huggingface.co/katanemo/Plano-Orchestrator-4B-FP8>`_ — FP8 quantized 4B model, lower memory usage
|
||||
* `Plano-Orchestrator-30B-A3B <https://huggingface.co/katanemo/Plano-Orchestrator-30B-A3B>`_ — full-size model for production
|
||||
* `Plano-Orchestrator-30B-A3B-FP8 <https://huggingface.co/katanemo/Plano-Orchestrator-30B-A3B-FP8>`_ — FP8 quantized 30B model, recommended for production deployments
|
||||
|
||||
Using vLLM
|
||||
~~~~~~~~~~
|
||||
|
||||
1. **Install vLLM**
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip install vllm
|
||||
|
||||
2. **Download the model and chat template**
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip install huggingface_hub
|
||||
huggingface-cli download katanemo/Plano-Orchestrator-4B
|
||||
|
||||
3. **Start the vLLM server**
|
||||
|
||||
For the 4B model (development):
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
vllm serve katanemo/Plano-Orchestrator-4B \
|
||||
--host 0.0.0.0 \
|
||||
--port 8000 \
|
||||
--tensor-parallel-size 1 \
|
||||
--gpu-memory-utilization 0.3 \
|
||||
--tokenizer katanemo/Plano-Orchestrator-4B \
|
||||
--chat-template chat_template.jinja \
|
||||
--served-model-name katanemo/Plano-Orchestrator-4B \
|
||||
--enable-prefix-caching
|
||||
|
||||
For the 30B-A3B-FP8 model (production):
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
vllm serve katanemo/Plano-Orchestrator-30B-A3B-FP8 \
|
||||
--host 0.0.0.0 \
|
||||
--port 8000 \
|
||||
--tensor-parallel-size 1 \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
--tokenizer katanemo/Plano-Orchestrator-30B-A3B-FP8 \
|
||||
--chat-template chat_template.jinja \
|
||||
--max-model-len 32768 \
|
||||
--served-model-name katanemo/Plano-Orchestrator-30B-A3B-FP8 \
|
||||
--enable-prefix-caching
|
||||
|
||||
4. **Configure Plano to use the local orchestrator**
|
||||
|
||||
Use the model name matching your ``--served-model-name``:
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
overrides:
|
||||
agent_orchestration_model: plano/katanemo/Plano-Orchestrator-4B
|
||||
|
||||
model_providers:
|
||||
- model: plano/katanemo/Plano-Orchestrator-4B
|
||||
base_url: http://<your-server-ip>:8000
|
||||
|
||||
5. **Verify the server is running**
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
curl http://localhost:8000/health
|
||||
curl http://localhost:8000/v1/models
|
||||
|
||||
|
||||
Next Steps
|
||||
----------
|
||||
|
||||
|
|
|
|||
|
|
@ -107,11 +107,11 @@ model_providers:
|
|||
- internal: true
|
||||
model: Arch-Function
|
||||
name: arch-function
|
||||
provider_interface: arch
|
||||
provider_interface: plano
|
||||
- internal: true
|
||||
model: Plano-Orchestrator
|
||||
name: plano-orchestrator
|
||||
provider_interface: arch
|
||||
name: plano/orchestrator
|
||||
provider_interface: plano
|
||||
prompt_targets:
|
||||
- description: Get current weather at a location.
|
||||
endpoint:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue