mirror of
https://github.com/katanemo/plano.git
synced 2026-04-25 00:36:34 +02:00
add dynamic log config
This commit is contained in:
parent
f019f05738
commit
f04c0b7cdd
9 changed files with 634 additions and 5 deletions
|
|
@ -74,6 +74,7 @@ def docker_start_plano_detached(
|
|||
port_mappings = [
|
||||
"12001:12001",
|
||||
"19901:9901",
|
||||
"19091:9091",
|
||||
]
|
||||
|
||||
for port in gateway_ports:
|
||||
|
|
|
|||
|
|
@ -593,6 +593,82 @@ def cli_agent(type, file, path, settings):
|
|||
sys.exit(1)
|
||||
|
||||
|
||||
@click.command("log-level")
@click.argument("level", required=False)
@click.option(
    "--show",
    is_flag=True,
    help="Show current log levels for both brightstaff and Envoy.",
)
@click.option(
    "--docker",
    default=False,
    is_flag=True,
    help="Target a Docker-based Plano instance.",
)
def log_level(level, show, docker):
    """Dynamically change the log level for a running Plano instance.

    Sets the log level for both the brightstaff service and Envoy proxy.
    LEVEL accepts standard log levels (trace, debug, info, warn, error) or
    RUST_LOG-style filters (e.g. 'brightstaff=debug,info').
    """
    # Imported lazily so the CLI loads fast when this command isn't used.
    import requests as req

    # Docker instances expose the admin ports remapped with a leading "1"
    # (see the port_mappings used when starting the container).
    brightstaff_port = 19091 if docker else 9091
    envoy_admin_port = 19901 if docker else 9901
    brightstaff_url = f"http://localhost:{brightstaff_port}/admin/log-level"
    envoy_url = f"http://localhost:{envoy_admin_port}/logging"

    console = _console()

    if show or not level:
        # Show current log levels; each service is queried independently so
        # one being down doesn't hide the other's status.
        try:
            resp = req.get(brightstaff_url, timeout=3)
            data = resp.json()
            console.print(
                f" brightstaff: [bold]{data.get('level', 'unknown')}[/bold]"
            )
        except Exception:
            console.print(" brightstaff: [dim]unavailable[/dim]")

        try:
            # Availability probe only — Envoy's /logging output is a plain-text
            # per-logger table, so we point the user at it rather than parse it.
            req.get(envoy_url, timeout=3)
            console.print(f" envoy: [dim](see {envoy_url} for per-logger levels)[/dim]")
        except Exception:
            console.print(" envoy: [dim]unavailable[/dim]")
        return

    # Set log level on both services; collect failures so we can exit(1)
    # after attempting both rather than aborting on the first.
    errors = []
    try:
        resp = req.put(brightstaff_url, data=level, timeout=3)
        if resp.status_code == 200:
            console.print(f" brightstaff → [bold]{level}[/bold]")
        else:
            # The error body is usually JSON, but fall back to raw text so a
            # non-JSON error page doesn't get misreported as "unavailable".
            try:
                err = resp.json().get("error", resp.text)
            except ValueError:
                err = resp.text
            console.print(f" brightstaff: [red]error[/red] — {err}")
            errors.append("brightstaff")
    except Exception as e:
        console.print(f" brightstaff: [red]unavailable[/red] — {e}")
        errors.append("brightstaff")

    try:
        resp = req.post(f"{envoy_url}?level={level}", timeout=3)
        if resp.status_code == 200:
            console.print(f" envoy → [bold]{level}[/bold]")
        else:
            console.print(f" envoy: [red]error[/red] — {resp.text.strip()}")
            errors.append("envoy")
    except Exception as e:
        console.print(f" envoy: [red]unavailable[/red] — {e}")
        errors.append("envoy")

    if errors:
        sys.exit(1)
|
||||
|
||||
|
||||
# add commands to the main group
|
||||
main.add_command(up)
|
||||
main.add_command(down)
|
||||
|
|
@ -602,6 +678,7 @@ main.add_command(cli_agent)
|
|||
main.add_command(generate_prompt_targets)
|
||||
main.add_command(init_cmd, name="init")
|
||||
main.add_command(trace_cmd, name="trace")
|
||||
main.add_command(log_level)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ use brightstaff::router::orchestrator::OrchestratorService;
|
|||
use brightstaff::state::memory::MemoryConversationalStorage;
|
||||
use brightstaff::state::postgresql::PostgreSQLConversationStorage;
|
||||
use brightstaff::state::StateStorage;
|
||||
use brightstaff::tracing::init_tracer;
|
||||
use brightstaff::tracing::{get_log_level, init_tracer, set_log_level};
|
||||
use bytes::Bytes;
|
||||
use common::configuration::{
|
||||
Agent, Configuration, FilterPipeline, ListenerType, ResolvedFilterChain,
|
||||
|
|
@ -384,6 +384,59 @@ async fn init_state_storage(
|
|||
Ok(Some(storage))
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Admin handlers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
use http_body_util::BodyExt;
|
||||
|
||||
fn json_response(
|
||||
status: StatusCode,
|
||||
body: &str,
|
||||
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {
|
||||
let bytes = Bytes::from(body.to_string());
|
||||
let body = http_body_util::Full::new(bytes)
|
||||
.map_err(|never| match never {})
|
||||
.boxed();
|
||||
let mut resp = Response::new(body);
|
||||
*resp.status_mut() = status;
|
||||
resp.headers_mut()
|
||||
.insert("Content-Type", HeaderValue::from_static("application/json"));
|
||||
Ok(resp)
|
||||
}
|
||||
|
||||
async fn handle_get_log_level() -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {
|
||||
match get_log_level() {
|
||||
Some(level) => json_response(StatusCode::OK, &format!("{{\"level\":\"{level}\"}}")),
|
||||
None => json_response(
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
"{\"error\":\"tracer not initialized\"}",
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
async fn handle_set_log_level(
|
||||
req: Request<Incoming>,
|
||||
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {
|
||||
let body = req.collect().await.map_err(|_| ()).unwrap();
|
||||
let new_level = String::from_utf8_lossy(&body.to_bytes()).trim().to_string();
|
||||
|
||||
if new_level.is_empty() {
|
||||
return json_response(
|
||||
StatusCode::BAD_REQUEST,
|
||||
"{\"error\":\"body must contain a log level filter, e.g. 'debug'\"}",
|
||||
);
|
||||
}
|
||||
|
||||
match set_log_level(&new_level) {
|
||||
Ok(()) => {
|
||||
info!(level = %new_level, "log level updated");
|
||||
json_response(StatusCode::OK, &format!("{{\"level\":\"{new_level}\"}}"))
|
||||
}
|
||||
Err(e) => json_response(StatusCode::BAD_REQUEST, &format!("{{\"error\":\"{e}\"}}")),
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Request routing
|
||||
// ---------------------------------------------------------------------------
|
||||
|
|
@ -426,6 +479,13 @@ async fn route(
|
|||
}
|
||||
}
|
||||
|
||||
// --- Admin routes ---
|
||||
match (req.method(), path.as_str()) {
|
||||
(&Method::GET, "/admin/log-level") => return handle_get_log_level().await,
|
||||
(&Method::PUT, "/admin/log-level") => return handle_set_log_level(req).await,
|
||||
_ => {}
|
||||
}
|
||||
|
||||
// --- Standard routes ---
|
||||
match (req.method(), path.as_str()) {
|
||||
(&Method::POST, CHAT_COMPLETIONS_PATH | MESSAGES_PATH | OPENAI_RESPONSES_API_PATH) => {
|
||||
|
|
|
|||
|
|
@ -8,12 +8,38 @@ use tracing::{Event, Subscriber};
|
|||
use tracing_subscriber::fmt::{format, time::FormatTime, FmtContext, FormatEvent, FormatFields};
|
||||
use tracing_subscriber::layer::SubscriberExt;
|
||||
use tracing_subscriber::registry::LookupSpan;
|
||||
use tracing_subscriber::reload;
|
||||
use tracing_subscriber::util::SubscriberInitExt;
|
||||
use tracing_subscriber::EnvFilter;
|
||||
|
||||
use super::ServiceNameOverrideExporter;
|
||||
use common::configuration::Tracing;
|
||||
|
||||
type ReloadHandle = reload::Handle<EnvFilter, tracing_subscriber::Registry>;
|
||||
|
||||
static LOG_LEVEL_HANDLE: OnceLock<ReloadHandle> = OnceLock::new();
|
||||
|
||||
/// Dynamically change the log level filter at runtime.
|
||||
///
|
||||
/// Accepts any valid `RUST_LOG` / `EnvFilter` syntax, e.g. `"debug"`,
|
||||
/// `"brightstaff=trace,info"`.
|
||||
pub fn set_log_level(new_filter: &str) -> Result<(), String> {
|
||||
let handle = LOG_LEVEL_HANDLE
|
||||
.get()
|
||||
.ok_or_else(|| "tracer not initialized".to_string())?;
|
||||
let filter = EnvFilter::try_new(new_filter)
|
||||
.map_err(|e| format!("invalid filter '{new_filter}': {e}"))?;
|
||||
handle
|
||||
.reload(filter)
|
||||
.map_err(|e| format!("failed to reload filter: {e}"))
|
||||
}
|
||||
|
||||
/// Returns the current log level filter string, if the tracer is initialized.
|
||||
pub fn get_log_level() -> Option<String> {
|
||||
let handle = LOG_LEVEL_HANDLE.get()?;
|
||||
handle.with_current(|f| f.to_string()).ok()
|
||||
}
|
||||
|
||||
struct BracketedTime;
|
||||
|
||||
impl FormatTime for BracketedTime {
|
||||
|
|
@ -118,9 +144,10 @@ pub fn init_tracer(tracing_config: Option<&Tracing>) -> &'static SdkTracerProvid
|
|||
let telemetry_layer =
|
||||
tracing_opentelemetry::layer().with_tracer(provider.tracer("brightstaff"));
|
||||
|
||||
// Combine the OpenTelemetry layer with fmt layer using the registry
|
||||
let env_filter =
|
||||
EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info"));
|
||||
let (filter_layer, reload_handle) = reload::Layer::new(env_filter);
|
||||
LOG_LEVEL_HANDLE.set(reload_handle).ok();
|
||||
|
||||
// Create fmt layer with span field formatting enabled (no ANSI to keep fields parseable)
|
||||
let fmt_layer = tracing_subscriber::fmt::layer()
|
||||
|
|
@ -129,8 +156,8 @@ pub fn init_tracer(tracing_config: Option<&Tracing>) -> &'static SdkTracerProvid
|
|||
.with_ansi(false);
|
||||
|
||||
let subscriber = tracing_subscriber::registry()
|
||||
.with(filter_layer)
|
||||
.with(telemetry_layer)
|
||||
.with(env_filter)
|
||||
.with(fmt_layer);
|
||||
|
||||
tracing::subscriber::set_global_default(subscriber)
|
||||
|
|
@ -144,6 +171,8 @@ pub fn init_tracer(tracing_config: Option<&Tracing>) -> &'static SdkTracerProvid
|
|||
|
||||
let env_filter =
|
||||
EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info"));
|
||||
let (filter_layer, reload_handle) = reload::Layer::new(env_filter);
|
||||
LOG_LEVEL_HANDLE.set(reload_handle).ok();
|
||||
|
||||
// Create fmt layer with span field formatting enabled (no ANSI to keep fields parseable)
|
||||
let fmt_layer = tracing_subscriber::fmt::layer()
|
||||
|
|
@ -152,7 +181,7 @@ pub fn init_tracer(tracing_config: Option<&Tracing>) -> &'static SdkTracerProvid
|
|||
.with_ansi(false);
|
||||
|
||||
tracing_subscriber::registry()
|
||||
.with(env_filter)
|
||||
.with(filter_layer)
|
||||
.with(fmt_layer)
|
||||
.init();
|
||||
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ pub use constants::{
|
|||
error, http, llm, operation_component, routing, signals, OperationNameBuilder,
|
||||
};
|
||||
pub use custom_attributes::collect_custom_trace_attributes;
|
||||
pub use init::init_tracer;
|
||||
pub use init::{get_log_level, init_tracer, set_log_level};
|
||||
pub use service_name_exporter::{ServiceNameOverrideExporter, SERVICE_NAME_OVERRIDE_KEY};
|
||||
|
||||
use opentelemetry::trace::get_active_span;
|
||||
|
|
|
|||
27
demos/llm_routing/model_routing_service/config_metrics.yaml
Normal file
27
demos/llm_routing/model_routing_service/config_metrics.yaml
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
|
||||
version: v0.4.0
|
||||
|
||||
overrides:
|
||||
llm_routing_model: plano/Plano-Orchestrator
|
||||
agent_orchestration_model: plano/Plano-Orchestrator
|
||||
|
||||
listeners:
|
||||
- type: model
|
||||
name: model_listener
|
||||
port: 12000
|
||||
|
||||
model_providers:
|
||||
- model: plano/Plano-Orchestrator
|
||||
base_url: http://plano-orchestrator:10001
|
||||
passthrough_auth: true
|
||||
|
||||
model_metrics_sources:
|
||||
- type: cost
|
||||
provider: digitalocean
|
||||
refresh_interval: 3600
|
||||
|
||||
- type: latency
|
||||
provider: prometheus
|
||||
url: http://metrics-kube-prometheus-st-prometheus:9090
|
||||
query: histogram_quantile(0.95, sum by (model_name, le) (rate(inference_proxy_inference_client_ttft_duration_bucket[5m])))
|
||||
refresh_interval: 300
|
||||
358
docs/blogs/model-routing-service.md
Normal file
358
docs/blogs/model-routing-service.md
Normal file
|
|
@ -0,0 +1,358 @@
|
|||
# Intent-Aware LLM Routing at Infrastructure Speed: How We Built a Purpose-Built 1.5B Router
|
||||
|
||||
*Adil Hafeez | April 2026*
|
||||
|
||||
---
|
||||
|
||||
Every team running multi-model LLM infrastructure eventually hits the same problem: you have five providers, each with different cost and latency profiles, and the right model for a coding question is not the right model for a summarization task. How do you route each request to the best model — without adding seconds of latency or dollars of cost to every call?
|
||||
|
||||
We built [Plano](https://github.com/katanemo/plano), an open-source AI-native proxy built on Envoy, to solve this at the infrastructure layer. This post is a deep dive into one specific piece: the **Model Routing Service** — how we use a purpose-built 1.5B parameter model to classify user intent in ~50ms and rank candidate models using live cost and latency data.
|
||||
|
||||
## The Routing Problem
|
||||
|
||||
When your application talks to multiple LLM providers, you need a routing decision on every request. Teams typically reach for one of three approaches, and each breaks down in a predictable way.
|
||||
|
||||
**Keyword and regex matching** is the first instinct. Match "write a function" to the code model, "explain" to the chat model. It's fast — effectively zero latency — but brittle. "Can you code this up?" doesn't match "write a function," and maintenance cost scales linearly with your vocabulary. Every new phrasing requires a new rule.
|
||||
|
||||
**Using a frontier model as a classifier** is the next step. Send the user's message to GPT-4 or Claude with a system prompt like "classify this as code_generation or general_question." It works well — frontier models are excellent at intent classification. But you're spending $0.01–0.03 and 500ms–2s per classification call. You're paying a frontier model just to decide which frontier model to call.
|
||||
|
||||
**Static rules and load balancing** ignore semantics entirely. Round-robin across a model pool, or route by endpoint path. A complex reasoning question and a simple chat message hit the same model. You're either overpaying (sending everything to the expensive model) or underserving (sending everything to the cheap one).
|
||||
|
||||
The gap is clear: we needed something that understands intent like a frontier model but runs at infrastructure speed and infrastructure cost.
|
||||
|
||||
## Arch-Router: A Purpose-Built 1.5B Classification Model
|
||||
|
||||
Rather than repurposing a general-purpose LLM, we trained a dedicated model for one job: given a conversation and a set of route descriptions, return the name of the best-matching route.
|
||||
|
||||
[Arch-Router](https://huggingface.co/katanemo/Arch-Router-1.5B) is a 1.5B parameter model fine-tuned specifically for routing classification. It's not a chat model — it doesn't generate prose, explain its reasoning, or handle follow-up questions. It reads a conversation, compares it against route descriptions, and emits a JSON object: `{"route": "code_generation"}` or `{"route": "other"}` if nothing matches.
|
||||
|
||||
**Why 1.5B parameters?** We evaluated models across three orders of magnitude. At 125M parameters, accuracy drops sharply on ambiguous queries — "help me with this code" could be generation or debugging, and smaller models can't reliably distinguish based on conversational context. At 7B+ parameters, accuracy improves marginally (<2% on our benchmark) but latency doubles and GPU memory requirements triple. 1.5B is the inflection point: accurate enough for production routing, small enough to run on a single GPU with 30% memory utilization.
|
||||
|
||||
For deployment, we quantize to **Q4_K_M GGUF format**, which keeps GPU memory at ~2GB and enables serving via [vLLM](https://github.com/vllm-project/vllm) with prefix caching enabled. The quantized model maintains classification accuracy within 1% of the full-precision version on our routing benchmark.
|
||||
|
||||
### How the Prompt Works
|
||||
|
||||
The system prompt uses XML-tagged route descriptions — a deliberate choice over JSON because small models handle XML boundary tokens more reliably:
|
||||
|
||||
```
|
||||
You are a helpful assistant designed to find the best suited route.
|
||||
You are provided with route description within <routes></routes> XML tags:
|
||||
<routes>
|
||||
{routes}
|
||||
</routes>
|
||||
|
||||
<conversation>
|
||||
{conversation}
|
||||
</conversation>
|
||||
|
||||
Your task is to decide which route is best suit with user intent on the
|
||||
conversation in <conversation></conversation> XML tags. Follow the instruction:
|
||||
1. If the latest intent from user is irrelevant or user intent is full filled,
|
||||
response with other route {"route": "other"}.
|
||||
2. You must analyze the route descriptions and find the best match route for
|
||||
user latest intent.
|
||||
3. You only response the name of the route that best matches the user's request,
|
||||
use the exact name in the <routes></routes>.
|
||||
|
||||
Based on your analysis, provide your response in the following JSON formats if
|
||||
you decide to match any route:
|
||||
{"route": "route_name"}
|
||||
```
|
||||
|
||||
The `{routes}` placeholder is populated from the YAML configuration — each route has a name and a natural-language description. The `{conversation}` placeholder gets the user's messages, with system messages and tool calls filtered out to focus on user intent. We cap input at 2048 tokens; routing decisions should be based on recent context, not entire conversation histories.
|
||||
|
||||
This is binary classification per route, not N-way. The model evaluates each route description against the conversation and picks the best match. If nothing fits, it returns `"other"` and the request falls through to the default model.
|
||||
|
||||
We also trained a variant called **Plano-Orchestrator** for multi-agent scenarios, where the model returns an array of matching routes: `{"route": ["research_agent", "code_agent"]}`. Same architecture, different training objective.
|
||||
|
||||
## The Ranking Engine: Live Cost and Latency Data
|
||||
|
||||
Knowing the right *route* is only half the problem. Within a route, you might have three candidate models — and the best one depends on whether you're optimizing for cost or latency right now. Static ordering doesn't cut it because model pricing changes, latency drifts with load, and rate limits shift availability.
|
||||
|
||||
Plano's `ModelMetricsService` continuously fetches cost and latency data from external sources, then ranks candidate models at request time.
|
||||
|
||||
The core ranking function is straightforward:
|
||||
|
||||
```rust
|
||||
pub async fn rank_models(&self, models: &[String], policy: &SelectionPolicy) -> Vec<String> {
|
||||
match policy.prefer {
|
||||
SelectionPreference::Cheapest => {
|
||||
let data = self.cost.read().await;
|
||||
rank_by_ascending_metric(models, &data)
|
||||
}
|
||||
SelectionPreference::Fastest => {
|
||||
let data = self.latency.read().await;
|
||||
rank_by_ascending_metric(models, &data)
|
||||
}
|
||||
SelectionPreference::Random => shuffle(models),
|
||||
SelectionPreference::None => models.to_vec(),
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Models with no metric data get appended last — they're still available as fallback but won't be preferred. The system logs a warning both at startup and per-request when a model has no data, so you can catch misconfigurations early.
|
||||
|
||||
### Metrics Sources
|
||||
|
||||
**Cost data** is fetched from DigitalOcean's public Gen-AI pricing API, which requires no authentication and returns input/output pricing per million tokens for all models in the catalog. We compute a single cost scalar as `input_price_per_million + output_price_per_million` — only relative ordering matters, not absolute numbers.
|
||||
|
||||
**Latency data** comes from Prometheus. You provide a PromQL query that returns an instant vector with a `model_name` label — typically a P95 histogram quantile over your actual traffic. The system re-fetches on a configurable interval (default: 60s for latency, 3600s for cost).
|
||||
|
||||
A `model_aliases` map bridges naming differences. DigitalOcean's catalog uses `openai-gpt-4o`; your config might use `openai/gpt-4o`. The alias map handles this without changing your routing configuration.
|
||||
|
||||
### Fail-Fast Validation
|
||||
|
||||
Plano validates metric source configuration at startup and exits with a clear error if the setup is inconsistent:
|
||||
|
||||
| Condition | Error |
|
||||
|---|---|
|
||||
| `prefer: cheapest` with no cost source | `requires a cost metrics source` |
|
||||
| `prefer: fastest` with no latency source | `requires a latency metrics source` |
|
||||
|
||||
This is a deliberate design choice. Misconfigured routing that silently falls back to default ordering is worse than a startup crash — you'd spend hours debugging why your "cheapest" policy is serving GPT-4o before GPT-4o-mini.
|
||||
|
||||
## Architecture: Why Envoy, WASM, and Async Rust
|
||||
|
||||
The routing service doesn't exist in isolation. It runs inside Plano's three-layer architecture, and the choice of each layer directly affects routing performance.
|
||||
|
||||
```
|
||||
Client ──► Envoy (llm_gateway.wasm) ──► Brightstaff ──► LLM Providers
|
||||
│
|
||||
Arch-Router (1.5B)
|
||||
Metrics Service
|
||||
```
|
||||
|
||||
### Layer 1: Envoy as Transport Substrate
|
||||
|
||||
We don't implement TLS, connection pooling, retries, circuit breaking, or HTTP/2 multiplexing. Envoy does all of this, battle-tested across deployments at Google, Lyft, and thousands of other production environments. Building a custom HTTP server to handle LLM traffic would mean reimplementing solved infrastructure problems — and getting them wrong in subtle ways under load.
|
||||
|
||||
Envoy's threading model matters here: one event-loop worker per CPU core, each connection pinned to a single worker. There's no lock contention in the hot path. For streaming LLM responses — which are long-lived, chunked HTTP connections — this model scales naturally. We're building on Envoy because we were early contributors to the project and understand its extension points deeply.
|
||||
|
||||
### Layer 2: LLM Gateway (WASM Plugin)
|
||||
|
||||
The `llm_gateway.wasm` filter runs inside Envoy's process — not as a sidecar, not as a separate service. It handles format translation between providers (OpenAI, Anthropic, Gemini, Mistral, Groq, DeepSeek, xAI, Bedrock) at wire speed with zero network hop.
|
||||
|
||||
The WASM sandbox imposes a strict constraint: **no std networking, no tokio, no async runtime**. Everything is `dispatch_http_call()` with a callback. All dependencies must be `no_std`-compatible. This is painful to develop against, but it produces a cleaner separation between I/O and logic — and the resulting binary is tiny (single-digit MBs) with a predictable memory footprint.
|
||||
|
||||
The format translation layer is powered by `hermesllm`, our Rust crate for LLM API abstraction. Adding a new provider means implementing `ProviderRequest` and `ProviderResponse` traits — the router and gateway don't need to change.
|
||||
|
||||
### Layer 3: Brightstaff (Native Async Rust)
|
||||
|
||||
The routing logic — `RouterService`, `ModelMetricsService`, OTEL tracing — lives in Brightstaff, a native Rust binary running on the Tokio async runtime alongside Envoy. One lightweight Tokio task per request, not one OS thread. This handles thousands of concurrent routing decisions on modest hardware.
|
||||
|
||||
**Why Rust?** In a proxy that handles streaming LLM responses, garbage collector pauses cause visible stutter in token delivery. Go's GC pause (typically 0.1-1ms) is fine for most applications but noticeable in a token stream delivering chunks every 20-50ms. Rust's ownership model eliminates this class of bugs entirely — no GC, no pauses, predictable latency.
|
||||
|
||||
## Running the Model Routing Service
|
||||
|
||||
Here's the complete setup from our [demo](https://github.com/katanemo/plano/tree/main/demos/llm_routing/model_routing_service). The config defines two routes with different ranking strategies and two metrics sources:
|
||||
|
||||
```yaml
|
||||
version: v0.4.0
|
||||
|
||||
listeners:
|
||||
- type: model
|
||||
name: model_listener
|
||||
port: 12000
|
||||
|
||||
model_providers:
|
||||
- model: openai/gpt-4o-mini
|
||||
access_key: $OPENAI_API_KEY
|
||||
default: true
|
||||
- model: openai/gpt-4o
|
||||
access_key: $OPENAI_API_KEY
|
||||
- model: anthropic/claude-sonnet-4-20250514
|
||||
access_key: $ANTHROPIC_API_KEY
|
||||
|
||||
routing_preferences:
|
||||
- name: complex_reasoning
|
||||
description: complex reasoning tasks, multi-step analysis, or detailed explanations
|
||||
models:
|
||||
- openai/gpt-4o
|
||||
- openai/gpt-4o-mini
|
||||
selection_policy:
|
||||
prefer: cheapest
|
||||
|
||||
- name: code_generation
|
||||
description: generating new code, writing functions, or creating boilerplate
|
||||
models:
|
||||
- anthropic/claude-sonnet-4-20250514
|
||||
- openai/gpt-4o
|
||||
selection_policy:
|
||||
prefer: fastest
|
||||
|
||||
model_metrics_sources:
|
||||
- type: cost
|
||||
provider: digitalocean
|
||||
refresh_interval: 3600
|
||||
model_aliases:
|
||||
openai-gpt-4o: openai/gpt-4o
|
||||
openai-gpt-4o-mini: openai/gpt-4o-mini
|
||||
anthropic-claude-sonnet-4: anthropic/claude-sonnet-4-20250514
|
||||
|
||||
- type: latency
|
||||
provider: prometheus
|
||||
url: http://localhost:9090
|
||||
query: model_latency_p95_seconds
|
||||
refresh_interval: 60
|
||||
```
|
||||
|
||||
Start the metrics infrastructure and Plano:
|
||||
|
||||
```bash
|
||||
# Start Prometheus + mock metrics server
|
||||
docker compose up -d
|
||||
|
||||
# Start Plano
|
||||
planoai up config.yaml
|
||||
```
|
||||
|
||||
### Code Generation: Ranked by Latency
|
||||
|
||||
A coding request hits the `code_generation` route. With `prefer: fastest`, the metrics service checks P95 latencies from Prometheus — Claude-Sonnet at 0.85s beats GPT-4o at 1.20s:
|
||||
|
||||
```bash
|
||||
curl -s http://localhost:12000/routing/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "gpt-4o-mini",
|
||||
"messages": [
|
||||
{"role": "user", "content": "Write a Python function that implements binary search on a sorted array"}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
```json
|
||||
{
|
||||
"models": ["anthropic/claude-sonnet-4-20250514", "openai/gpt-4o"],
|
||||
"route": "code_generation",
|
||||
"trace_id": "c16d1096c1af4a17abb48fb182918a88"
|
||||
}
|
||||
```
|
||||
|
||||
### Complex Reasoning: Ranked by Cost
|
||||
|
||||
A reasoning request hits `complex_reasoning` with `prefer: cheapest`. DigitalOcean pricing puts GPT-4o-mini ($0.75/M tokens) well ahead of GPT-4o ($25/M):
|
||||
|
||||
```bash
|
||||
curl -s http://localhost:12000/routing/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "gpt-4o-mini",
|
||||
"messages": [
|
||||
{"role": "user", "content": "Explain the trade-offs between microservices and monolithic architectures, considering scalability, team structure, and operational complexity"}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
```json
|
||||
{
|
||||
"models": ["openai/gpt-4o-mini", "openai/gpt-4o"],
|
||||
"route": "complex_reasoning",
|
||||
"trace_id": "..."
|
||||
}
|
||||
```
|
||||
|
||||
### Per-Request Overrides
|
||||
|
||||
Config-level preferences set the default, but individual requests can override them with an inline `routing_preferences` field. This is stripped from the request before forwarding upstream — downstream providers never see it:
|
||||
|
||||
```bash
|
||||
curl -s http://localhost:12000/routing/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "gpt-4o-mini",
|
||||
"messages": [
|
||||
{"role": "user", "content": "Summarize the key differences between TCP and UDP"}
|
||||
],
|
||||
"routing_preferences": [
|
||||
{
|
||||
"name": "general",
|
||||
"description": "general questions, explanations, and summaries",
|
||||
"models": ["openai/gpt-4o", "openai/gpt-4o-mini"],
|
||||
"selection_policy": {"prefer": "cheapest"}
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
The response includes a ranked `models` array. The client pattern is simple — use `models[0]`, fall back to `models[1]` on 429 or 5xx:
|
||||
|
||||
```python
|
||||
response = plano.routing_decision(request)
|
||||
|
||||
for model in response["models"]:
|
||||
try:
|
||||
result = call_llm(model, messages)
|
||||
break # success
|
||||
except (RateLimitError, ServerError):
|
||||
continue # try next
|
||||
```
|
||||
|
||||
The `/routing/v1/*` endpoints return routing decisions without forwarding to the LLM — useful for testing routing behavior, integrating with existing orchestration code, or implementing custom fallback logic.
|
||||
|
||||
## Production Deployment: Self-Hosted on Kubernetes
|
||||
|
||||
For teams that need routing decisions to stay within their cluster — regulatory requirements, data sovereignty, or simply avoiding external API dependencies — Arch-Router can be self-hosted using vLLM.
|
||||
|
||||
The deployment uses an init container to download quantized weights from HuggingFace, then serves the model via vLLM's OpenAI-compatible endpoint:
|
||||
|
||||
```yaml
|
||||
initContainers:
|
||||
- name: download-model
|
||||
image: python:3.11-slim
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- |
|
||||
pip install huggingface_hub[cli] && \
|
||||
python -c "from huggingface_hub import snapshot_download; \
|
||||
snapshot_download('katanemo/Arch-Router-1.5B.gguf', \
|
||||
local_dir='/models/Arch-Router-1.5B.gguf')"
|
||||
containers:
|
||||
- name: vllm
|
||||
image: vllm/vllm-openai:latest
|
||||
command:
|
||||
- vllm
|
||||
- serve
|
||||
- /models/Arch-Router-1.5B.gguf/Arch-Router-1.5B-Q4_K_M.gguf
|
||||
- "--served-model-name"
|
||||
- "Arch-Router"
|
||||
- "--gpu-memory-utilization"
|
||||
- "0.3"
|
||||
- "--enable-prefix-caching"
|
||||
resources:
|
||||
requests:
|
||||
nvidia.com/gpu: "1"
|
||||
memory: "4Gi"
|
||||
```
|
||||
|
||||
GPU requirements are modest: a single L4 or L40S with 30% memory utilization. Prefix caching is enabled because route descriptions are constant across requests — the system prompt prefix is computed once and reused, cutting inference latency further.
|
||||
|
||||
The Plano config points to the in-cluster service:
|
||||
|
||||
```yaml
|
||||
overrides:
|
||||
llm_routing_model: plano/Arch-Router
|
||||
|
||||
model_providers:
|
||||
- model: plano/Arch-Router
|
||||
base_url: http://arch-router:10000
|
||||
```
|
||||
|
||||
For teams that don't want to manage GPU infrastructure, DigitalOcean's [GPU Droplets](https://www.digitalocean.com/products/gpu-droplets) provide single-click deployment of vLLM with NVIDIA L40S GPUs — spin up the Arch-Router as a managed inference endpoint without provisioning bare metal.
|
||||
|
||||
## What We Learned
|
||||
|
||||
Building and operating this in production surfaced a few non-obvious lessons:
|
||||
|
||||
**Purpose-built models beat general-purpose models for classification — if you have the training data.** A 1.5B model fine-tuned on routing decisions outperforms GPT-4 few-shot prompting on our benchmark, at 1/30th the cost and 1/20th the latency. The key is that routing is a narrow, well-defined task. You don't need a model that can write poetry to decide whether a query is about code or about cooking.
|
||||
|
||||
**Startup validation prevents an entire class of silent bugs.** Early versions logged warnings for misconfigured metrics sources. Users didn't notice the warnings, deployed to production, and spent hours debugging why "cheapest" routing wasn't actually routing by cost. Crashing at startup is better UX than silent degradation.
|
||||
|
||||
**The WASM no_std constraint produces cleaner code.** Not being able to reach for tokio or std::net forces a callback-driven architecture where every I/O operation is explicit. The resulting code is harder to write but trivially auditable — you can trace every external call from the code alone, without understanding a runtime.
|
||||
|
||||
**Live metrics ranking is more useful than static config because model performance drifts.** Provider latency varies by 2-3x throughout the day based on traffic patterns. A model that's "fastest" at 2am is often the slowest at 2pm. Refreshing Prometheus data every 60 seconds catches these shifts; static config doesn't.
|
||||
|
||||
---
|
||||
|
||||
The Model Routing Service is open source as part of [Plano](https://github.com/katanemo/plano). The complete demo, including Docker Compose, Kubernetes manifests, and example scripts, is at [`demos/llm_routing/model_routing_service/`](https://github.com/katanemo/plano/tree/main/demos/llm_routing/model_routing_service).
|
||||
76
docs/source/guides/observability/logging.rst
Normal file
76
docs/source/guides/observability/logging.rst
Normal file
|
|
@ -0,0 +1,76 @@
|
|||
.. _logging:
|
||||
|
||||
Logging
|
||||
=======
|
||||
|
||||
Plano supports dynamic log level changes at runtime, allowing you to increase
|
||||
verbosity for debugging without restarting the service.
|
||||
|
||||
Setting the Log Level at Startup
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Set the ``LOG_LEVEL`` environment variable before starting Plano:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
LOG_LEVEL=debug planoai up config.yaml
|
||||
|
||||
This controls both the brightstaff service (``RUST_LOG``) and Envoy's WASM
|
||||
component log level.
|
||||
|
||||
Changing the Log Level at Runtime
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Use the ``planoai log-level`` command to change levels on a running instance:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# Set both services to debug
|
||||
planoai log-level debug
|
||||
|
||||
# Set both services to info
|
||||
planoai log-level info
|
||||
|
||||
# Show current log levels
|
||||
planoai log-level --show
|
||||
|
||||
# For Docker-based instances
|
||||
planoai log-level debug --docker
|
||||
|
||||
The brightstaff service also accepts granular ``RUST_LOG``-style filters:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# Debug for brightstaff crate only, info for everything else
|
||||
planoai log-level "brightstaff=debug,info"
|
||||
|
||||
Available log levels (from most to least verbose): ``trace``, ``debug``,
|
||||
``info``, ``warn``, ``error``.
|
||||
|
||||
Direct API Access
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
||||
You can also change log levels directly via HTTP:
|
||||
|
||||
**Brightstaff** (port 9091, or 19091 in Docker mode):
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# Get current level
|
||||
curl http://localhost:9091/admin/log-level
|
||||
|
||||
# Set level
|
||||
curl -X PUT http://localhost:9091/admin/log-level -d "debug"
|
||||
|
||||
**Envoy** (port 9901, or 19901 in Docker mode):
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# View all logger levels
|
||||
curl http://localhost:9901/logging
|
||||
|
||||
# Set all loggers to debug
|
||||
curl -X POST "http://localhost:9901/logging?level=debug"
|
||||
|
||||
# Set only WASM component to debug
|
||||
curl -X POST "http://localhost:9901/logging?wasm=debug"
|
||||
|
|
@ -9,3 +9,4 @@ Observability
|
|||
tracing
|
||||
monitoring
|
||||
access_logging
|
||||
logging
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue