add dynamic log config

This commit is contained in:
Adil Hafeez 2026-04-01 15:40:40 -07:00
parent f019f05738
commit f04c0b7cdd
9 changed files with 634 additions and 5 deletions

View file

@ -74,6 +74,7 @@ def docker_start_plano_detached(
port_mappings = [
"12001:12001",
"19901:9901",
"19091:9091",
]
for port in gateway_ports:

View file

@ -593,6 +593,82 @@ def cli_agent(type, file, path, settings):
sys.exit(1)
@click.command("log-level")
@click.argument("level", required=False)
@click.option(
    "--show",
    is_flag=True,
    help="Show current log levels for both brightstaff and Envoy.",
)
@click.option(
    "--docker",
    default=False,
    is_flag=True,
    help="Target a Docker-based Plano instance.",
)
def log_level(level, show, docker):
    """Dynamically change the log level for a running Plano instance.

    Sets the log level for both the brightstaff service and Envoy proxy.
    LEVEL accepts standard log levels (trace, debug, info, warn, error) or
    RUST_LOG-style filters (e.g. 'brightstaff=debug,info').

    With --show (or when LEVEL is omitted) the current levels are printed
    instead of changed. Exits with status 1 if either service reports an
    error while setting the level.
    """
    # Imported lazily so the import cost is only paid when this command runs.
    import requests as req

    # Docker-based instances remap the admin ports (9091 -> 19091,
    # 9901 -> 19901); see the container port mappings in the start script.
    brightstaff_port = 19091 if docker else 9091
    envoy_admin_port = 19901 if docker else 9901
    brightstaff_url = f"http://localhost:{brightstaff_port}/admin/log-level"
    envoy_url = f"http://localhost:{envoy_admin_port}/logging"
    console = _console()
    if show or not level:
        # Show current log levels
        try:
            resp = req.get(brightstaff_url, timeout=3)
            data = resp.json()
            console.print(
                f" brightstaff: [bold]{data.get('level', 'unknown')}[/bold]"
            )
        except Exception:
            console.print(" brightstaff: [dim]unavailable[/dim]")
        try:
            # Envoy reports per-logger levels, not a single value, so we only
            # probe reachability here and point the user at the admin endpoint.
            resp = req.get(envoy_url, timeout=3)
            console.print(f" envoy: [dim](see {envoy_url} for per-logger levels)[/dim]")
        except Exception:
            console.print(" envoy: [dim]unavailable[/dim]")
        return
    # Set log level on both services
    errors = []
    try:
        # Brightstaff accepts the raw filter string as the PUT body.
        resp = req.put(brightstaff_url, data=level, timeout=3)
        if resp.status_code == 200:
            console.print(f" brightstaff → [bold]{level}[/bold]")
        else:
            err = resp.json().get("error", resp.text)
            console.print(f" brightstaff: [red]error[/red] — {err}")
            errors.append("brightstaff")
    except Exception as e:
        console.print(f" brightstaff: [red]unavailable[/red] — {e}")
        errors.append("brightstaff")
    try:
        # NOTE(review): Envoy's /logging endpoint expects a plain level name;
        # a RUST_LOG-style filter (e.g. 'brightstaff=debug,info') will likely
        # be rejected here and surface as an envoy error — confirm intended.
        resp = req.post(f"{envoy_url}?level={level}", timeout=3)
        if resp.status_code == 200:
            console.print(f" envoy → [bold]{level}[/bold]")
        else:
            console.print(f" envoy: [red]error[/red] — {resp.text.strip()}")
            errors.append("envoy")
    except Exception as e:
        console.print(f" envoy: [red]unavailable[/red] — {e}")
        errors.append("envoy")
    # Non-zero exit so scripts can detect a partial/failed update.
    if errors:
        sys.exit(1)
# add commands to the main group
main.add_command(up)
main.add_command(down)
@ -602,6 +678,7 @@ main.add_command(cli_agent)
main.add_command(generate_prompt_targets)
main.add_command(init_cmd, name="init")
main.add_command(trace_cmd, name="trace")
main.add_command(log_level)
if __name__ == "__main__":
main()

View file

@ -11,7 +11,7 @@ use brightstaff::router::orchestrator::OrchestratorService;
use brightstaff::state::memory::MemoryConversationalStorage;
use brightstaff::state::postgresql::PostgreSQLConversationStorage;
use brightstaff::state::StateStorage;
use brightstaff::tracing::init_tracer;
use brightstaff::tracing::{get_log_level, init_tracer, set_log_level};
use bytes::Bytes;
use common::configuration::{
Agent, Configuration, FilterPipeline, ListenerType, ResolvedFilterChain,
@ -384,6 +384,59 @@ async fn init_state_storage(
Ok(Some(storage))
}
// ---------------------------------------------------------------------------
// Admin handlers
// ---------------------------------------------------------------------------
use http_body_util::BodyExt;
/// Build an HTTP response carrying a JSON payload.
///
/// The `body` string is assumed to already be valid JSON; this helper only
/// boxes it into the response body type and sets the `Content-Type` header.
fn json_response(
    status: StatusCode,
    body: &str,
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {
    // `Full` is infallible; the `match never {}` converts its error type
    // into `hyper::Error` without any runtime cost.
    let boxed = http_body_util::Full::new(Bytes::from(body.to_string()))
        .map_err(|never| match never {})
        .boxed();
    let mut response = Response::new(boxed);
    *response.status_mut() = status;
    response
        .headers_mut()
        .insert("Content-Type", HeaderValue::from_static("application/json"));
    Ok(response)
}
/// `GET /admin/log-level` — report the currently active log filter.
///
/// Responds 200 with `{"level": "..."}` when the tracer has been
/// initialized, 500 otherwise.
async fn handle_get_log_level() -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {
    if let Some(level) = get_log_level() {
        json_response(StatusCode::OK, &format!("{{\"level\":\"{level}\"}}"))
    } else {
        json_response(
            StatusCode::INTERNAL_SERVER_ERROR,
            "{\"error\":\"tracer not initialized\"}",
        )
    }
}
/// `PUT /admin/log-level` — update the active log filter from the request body.
///
/// The body is a plain-text `RUST_LOG`-style filter (e.g. `debug` or
/// `brightstaff=trace,info`). Responds 200 with the applied level on
/// success, 400 on an empty/invalid filter or an unreadable body.
async fn handle_set_log_level(
    req: Request<Incoming>,
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {
    // Collecting the body can fail (e.g. client disconnect mid-body);
    // report 400 instead of panicking — the previous `.unwrap()` would
    // abort the serving task on any body read error.
    let body = match req.collect().await {
        Ok(collected) => collected,
        Err(_) => {
            return json_response(
                StatusCode::BAD_REQUEST,
                "{\"error\":\"failed to read request body\"}",
            );
        }
    };
    let new_level = String::from_utf8_lossy(&body.to_bytes()).trim().to_string();
    if new_level.is_empty() {
        return json_response(
            StatusCode::BAD_REQUEST,
            "{\"error\":\"body must contain a log level filter, e.g. 'debug'\"}",
        );
    }
    match set_log_level(&new_level) {
        Ok(()) => {
            info!(level = %new_level, "log level updated");
            json_response(StatusCode::OK, &format!("{{\"level\":\"{new_level}\"}}"))
        }
        // Escape embedded quotes so an error echoing an odd filter string
        // (e.g. one containing `"`) cannot produce malformed JSON.
        Err(e) => json_response(
            StatusCode::BAD_REQUEST,
            &format!("{{\"error\":\"{}\"}}", e.replace('"', "\\\"")),
        ),
    }
}
// ---------------------------------------------------------------------------
// Request routing
// ---------------------------------------------------------------------------
@ -426,6 +479,13 @@ async fn route(
}
}
// --- Admin routes ---
match (req.method(), path.as_str()) {
(&Method::GET, "/admin/log-level") => return handle_get_log_level().await,
(&Method::PUT, "/admin/log-level") => return handle_set_log_level(req).await,
_ => {}
}
// --- Standard routes ---
match (req.method(), path.as_str()) {
(&Method::POST, CHAT_COMPLETIONS_PATH | MESSAGES_PATH | OPENAI_RESPONSES_API_PATH) => {

View file

@ -8,12 +8,38 @@ use tracing::{Event, Subscriber};
use tracing_subscriber::fmt::{format, time::FormatTime, FmtContext, FormatEvent, FormatFields};
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::registry::LookupSpan;
use tracing_subscriber::reload;
use tracing_subscriber::util::SubscriberInitExt;
use tracing_subscriber::EnvFilter;
use super::ServiceNameOverrideExporter;
use common::configuration::Tracing;
type ReloadHandle = reload::Handle<EnvFilter, tracing_subscriber::Registry>;
static LOG_LEVEL_HANDLE: OnceLock<ReloadHandle> = OnceLock::new();
/// Dynamically change the log level filter at runtime.
///
/// Accepts any valid `RUST_LOG` / `EnvFilter` syntax, e.g. `"debug"`,
/// `"brightstaff=trace,info"`.
///
/// Errors if the tracer has not been initialized yet, if the filter
/// string does not parse, or if the reload layer rejects the swap.
pub fn set_log_level(new_filter: &str) -> Result<(), String> {
    // The reload handle is stashed by `init_tracer`; without it there is
    // nothing to reconfigure.
    let Some(handle) = LOG_LEVEL_HANDLE.get() else {
        return Err("tracer not initialized".to_string());
    };
    // Validate the filter up front so a bad string never reaches `reload`.
    let parsed = EnvFilter::try_new(new_filter)
        .map_err(|e| format!("invalid filter '{new_filter}': {e}"))?;
    handle
        .reload(parsed)
        .map_err(|e| format!("failed to reload filter: {e}"))
}
/// Returns the current log level filter string, if the tracer is initialized.
///
/// Yields `None` both when `init_tracer` has not run yet and when reading
/// the current filter from the reload layer fails.
pub fn get_log_level() -> Option<String> {
    LOG_LEVEL_HANDLE
        .get()
        .and_then(|handle| handle.with_current(|filter| filter.to_string()).ok())
}
struct BracketedTime;
impl FormatTime for BracketedTime {
@ -118,9 +144,10 @@ pub fn init_tracer(tracing_config: Option<&Tracing>) -> &'static SdkTracerProvid
let telemetry_layer =
tracing_opentelemetry::layer().with_tracer(provider.tracer("brightstaff"));
// Combine the OpenTelemetry layer with fmt layer using the registry
let env_filter =
EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info"));
let (filter_layer, reload_handle) = reload::Layer::new(env_filter);
LOG_LEVEL_HANDLE.set(reload_handle).ok();
// Create fmt layer with span field formatting enabled (no ANSI to keep fields parseable)
let fmt_layer = tracing_subscriber::fmt::layer()
@ -129,8 +156,8 @@ pub fn init_tracer(tracing_config: Option<&Tracing>) -> &'static SdkTracerProvid
.with_ansi(false);
let subscriber = tracing_subscriber::registry()
.with(filter_layer)
.with(telemetry_layer)
.with(env_filter)
.with(fmt_layer);
tracing::subscriber::set_global_default(subscriber)
@ -144,6 +171,8 @@ pub fn init_tracer(tracing_config: Option<&Tracing>) -> &'static SdkTracerProvid
let env_filter =
EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info"));
let (filter_layer, reload_handle) = reload::Layer::new(env_filter);
LOG_LEVEL_HANDLE.set(reload_handle).ok();
// Create fmt layer with span field formatting enabled (no ANSI to keep fields parseable)
let fmt_layer = tracing_subscriber::fmt::layer()
@ -152,7 +181,7 @@ pub fn init_tracer(tracing_config: Option<&Tracing>) -> &'static SdkTracerProvid
.with_ansi(false);
tracing_subscriber::registry()
.with(env_filter)
.with(filter_layer)
.with(fmt_layer)
.init();

View file

@ -7,7 +7,7 @@ pub use constants::{
error, http, llm, operation_component, routing, signals, OperationNameBuilder,
};
pub use custom_attributes::collect_custom_trace_attributes;
pub use init::init_tracer;
pub use init::{get_log_level, init_tracer, set_log_level};
pub use service_name_exporter::{ServiceNameOverrideExporter, SERVICE_NAME_OVERRIDE_KEY};
use opentelemetry::trace::get_active_span;

View file

@ -0,0 +1,27 @@
# Plano gateway configuration for the Plano-Orchestrator demo.
version: v0.4.0

# Route both LLM routing and agent orchestration through the in-cluster
# Plano-Orchestrator model instead of the defaults.
overrides:
  llm_routing_model: plano/Plano-Orchestrator
  agent_orchestration_model: plano/Plano-Orchestrator

listeners:
  - type: model
    name: model_listener
    port: 12000

model_providers:
  - model: plano/Plano-Orchestrator
    base_url: http://plano-orchestrator:10001
    # Forward the caller's auth header upstream rather than injecting a key.
    passthrough_auth: true

# Live metrics sources used to rank candidate models at request time.
model_metrics_sources:
  # Cost data, refreshed hourly (seconds).
  - type: cost
    provider: digitalocean
    refresh_interval: 3600
  # Latency: P95 of time-to-first-token from Prometheus, refreshed every 5 min.
  - type: latency
    provider: prometheus
    url: http://metrics-kube-prometheus-st-prometheus:9090
    query: histogram_quantile(0.95, sum by (model_name, le) (rate(inference_proxy_inference_client_ttft_duration_bucket[5m])))
    refresh_interval: 300

View file

@ -0,0 +1,358 @@
# Intent-Aware LLM Routing at Infrastructure Speed: How We Built a Purpose-Built 1.5B Router
*Adil Hafeez | April 2026*
---
Every team running multi-model LLM infrastructure eventually hits the same problem: you have five providers, each with different cost and latency profiles, and the right model for a coding question is not the right model for a summarization task. How do you route each request to the best model — without adding seconds of latency or dollars of cost to every call?
We built [Plano](https://github.com/katanemo/plano), an open-source AI-native proxy built on Envoy, to solve this at the infrastructure layer. This post is a deep dive into one specific piece: the **Model Routing Service** — how we use a purpose-built 1.5B parameter model to classify user intent in ~50ms and rank candidate models using live cost and latency data.
## The Routing Problem
When your application talks to multiple LLM providers, you need a routing decision on every request. Teams typically reach for one of three approaches, and each breaks down in a predictable way.
**Keyword and regex matching** is the first instinct. Match "write a function" to the code model, "explain" to the chat model. It's fast — effectively zero latency — but brittle. "Can you code this up?" doesn't match "write a function," and maintenance cost scales linearly with your vocabulary. Every new phrasing requires a new rule.
**Using a frontier model as a classifier** is the next step. Send the user's message to GPT-4 or Claude with a system prompt like "classify this as code_generation or general_question." It works well — frontier models are excellent at intent classification. But you're spending $0.01–$0.03 and 500ms–2s per classification call. You're paying a frontier model just to decide which frontier model to call.
**Static rules and load balancing** ignore semantics entirely. Round-robin across a model pool, or route by endpoint path. A complex reasoning question and a simple chat message hit the same model. You're either overpaying (sending everything to the expensive model) or underserving (sending everything to the cheap one).
The gap is clear: we needed something that understands intent like a frontier model but runs at infrastructure speed and infrastructure cost.
## Arch-Router: A Purpose-Built 1.5B Classification Model
Rather than repurposing a general-purpose LLM, we trained a dedicated model for one job: given a conversation and a set of route descriptions, return the name of the best-matching route.
[Arch-Router](https://huggingface.co/katanemo/Arch-Router-1.5B) is a 1.5B parameter model fine-tuned specifically for routing classification. It's not a chat model — it doesn't generate prose, explain its reasoning, or handle follow-up questions. It reads a conversation, compares it against route descriptions, and emits a JSON object: `{"route": "code_generation"}` or `{"route": "other"}` if nothing matches.
**Why 1.5B parameters?** We evaluated models across three orders of magnitude. At 125M parameters, accuracy drops sharply on ambiguous queries — "help me with this code" could be generation or debugging, and smaller models can't reliably distinguish based on conversational context. At 7B+ parameters, accuracy improves marginally (<2% on our benchmark) but latency doubles and GPU memory requirements triple. 1.5B is the inflection point: accurate enough for production routing, small enough to run on a single GPU with 30% memory utilization.
For deployment, we quantize to **Q4_K_M GGUF format**, which keeps GPU memory at ~2GB and enables serving via [vLLM](https://github.com/vllm-project/vllm) with prefix caching enabled. The quantized model maintains classification accuracy within 1% of the full-precision version on our routing benchmark.
### How the Prompt Works
The system prompt uses XML-tagged route descriptions — a deliberate choice over JSON because small models handle XML boundary tokens more reliably:
```
You are a helpful assistant designed to find the best suited route.
You are provided with route description within <routes></routes> XML tags:
<routes>
{routes}
</routes>
<conversation>
{conversation}
</conversation>
Your task is to decide which route is best suit with user intent on the
conversation in <conversation></conversation> XML tags. Follow the instruction:
1. If the latest intent from user is irrelevant or user intent is full filled,
response with other route {"route": "other"}.
2. You must analyze the route descriptions and find the best match route for
user latest intent.
3. You only response the name of the route that best matches the user's request,
use the exact name in the <routes></routes>.
Based on your analysis, provide your response in the following JSON formats if
you decide to match any route:
{"route": "route_name"}
```
The `{routes}` placeholder is populated from the YAML configuration — each route has a name and a natural-language description. The `{conversation}` placeholder gets the user's messages, with system messages and tool calls filtered out to focus on user intent. We cap input at 2048 tokens; routing decisions should be based on recent context, not entire conversation histories.
This is binary classification per route, not N-way. The model evaluates each route description against the conversation and picks the best match. If nothing fits, it returns `"other"` and the request falls through to the default model.
We also trained a variant called **Plano-Orchestrator** for multi-agent scenarios, where the model returns an array of matching routes: `{"route": ["research_agent", "code_agent"]}`. Same architecture, different training objective.
## The Ranking Engine: Live Cost and Latency Data
Knowing the right *route* is only half the problem. Within a route, you might have three candidate models — and the best one depends on whether you're optimizing for cost or latency right now. Static ordering doesn't cut it because model pricing changes, latency drifts with load, and rate limits shift availability.
Plano's `ModelMetricsService` continuously fetches cost and latency data from external sources, then ranks candidate models at request time.
The core ranking function is straightforward:
```rust
pub async fn rank_models(&self, models: &[String], policy: &SelectionPolicy) -> Vec<String> {
match policy.prefer {
SelectionPreference::Cheapest => {
let data = self.cost.read().await;
rank_by_ascending_metric(models, &data)
}
SelectionPreference::Fastest => {
let data = self.latency.read().await;
rank_by_ascending_metric(models, &data)
}
SelectionPreference::Random => shuffle(models),
SelectionPreference::None => models.to_vec(),
}
}
```
Models with no metric data get appended last — they're still available as fallback but won't be preferred. The system logs a warning both at startup and per-request when a model has no data, so you can catch misconfigurations early.
### Metrics Sources
**Cost data** is fetched from DigitalOcean's public Gen-AI pricing API, which requires no authentication and returns input/output pricing per million tokens for all models in the catalog. We compute a single cost scalar as `input_price_per_million + output_price_per_million` — only relative ordering matters, not absolute numbers.
**Latency data** comes from Prometheus. You provide a PromQL query that returns an instant vector with a `model_name` label — typically a P95 histogram quantile over your actual traffic. The system re-fetches on a configurable interval (default: 60s for latency, 3600s for cost).
A `model_aliases` map bridges naming differences. DigitalOcean's catalog uses `openai-gpt-4o`; your config might use `openai/gpt-4o`. The alias map handles this without changing your routing configuration.
### Fail-Fast Validation
Plano validates metric source configuration at startup and exits with a clear error if the setup is inconsistent:
| Condition | Error |
|---|---|
| `prefer: cheapest` with no cost source | `requires a cost metrics source` |
| `prefer: fastest` with no latency source | `requires a latency metrics source` |
This is a deliberate design choice. Misconfigured routing that silently falls back to default ordering is worse than a startup crash — you'd spend hours debugging why your "cheapest" policy is serving GPT-4o before GPT-4o-mini.
## Architecture: Why Envoy, WASM, and Async Rust
The routing service doesn't exist in isolation. It runs inside Plano's three-layer architecture, and the choice of each layer directly affects routing performance.
```
Client ──► Envoy (llm_gateway.wasm) ──► Brightstaff ──► LLM Providers
                                            │
                                            ├─► Arch-Router (1.5B)
                                            └─► Metrics Service
```
### Layer 1: Envoy as Transport Substrate
We don't implement TLS, connection pooling, retries, circuit breaking, or HTTP/2 multiplexing. Envoy does all of this, battle-tested across deployments at Google, Lyft, and thousands of other production environments. Building a custom HTTP server to handle LLM traffic would mean reimplementing solved infrastructure problems — and getting them wrong in subtle ways under load.
Envoy's threading model matters here: one event-loop worker per CPU core, each connection pinned to a single worker. There's no lock contention in the hot path. For streaming LLM responses — which are long-lived, chunked HTTP connections — this model scales naturally. We're building on Envoy because we were early contributors to the project and understand its extension points deeply.
### Layer 2: LLM Gateway (WASM Plugin)
The `llm_gateway.wasm` filter runs inside Envoy's process — not as a sidecar, not as a separate service. It handles format translation between providers (OpenAI, Anthropic, Gemini, Mistral, Groq, DeepSeek, xAI, Bedrock) at wire speed with zero network hop.
The WASM sandbox imposes a strict constraint: **no std networking, no tokio, no async runtime**. Everything is `dispatch_http_call()` with a callback. All dependencies must be `no_std`-compatible. This is painful to develop against, but it produces a cleaner separation between I/O and logic — and the resulting binary is tiny (single-digit MBs) with a predictable memory footprint.
The format translation layer is powered by `hermesllm`, our Rust crate for LLM API abstraction. Adding a new provider means implementing `ProviderRequest` and `ProviderResponse` traits — the router and gateway don't need to change.
### Layer 3: Brightstaff (Native Async Rust)
The routing logic — `RouterService`, `ModelMetricsService`, OTEL tracing — lives in Brightstaff, a native Rust binary running on the Tokio async runtime alongside Envoy. One lightweight Tokio task per request, not one OS thread. This handles thousands of concurrent routing decisions on modest hardware.
**Why Rust?** In a proxy that handles streaming LLM responses, garbage collector pauses cause visible stutter in token delivery. Go's GC pause (typically 0.1-1ms) is fine for most applications but noticeable in a token stream delivering chunks every 20-50ms. Rust's ownership model eliminates this class of bugs entirely — no GC, no pauses, predictable latency.
## Running the Model Routing Service
Here's the complete setup from our [demo](https://github.com/katanemo/plano/tree/main/demos/llm_routing/model_routing_service). The config defines two routes with different ranking strategies and two metrics sources:
```yaml
version: v0.4.0
listeners:
- type: model
name: model_listener
port: 12000
model_providers:
- model: openai/gpt-4o-mini
access_key: $OPENAI_API_KEY
default: true
- model: openai/gpt-4o
access_key: $OPENAI_API_KEY
- model: anthropic/claude-sonnet-4-20250514
access_key: $ANTHROPIC_API_KEY
routing_preferences:
- name: complex_reasoning
description: complex reasoning tasks, multi-step analysis, or detailed explanations
models:
- openai/gpt-4o
- openai/gpt-4o-mini
selection_policy:
prefer: cheapest
- name: code_generation
description: generating new code, writing functions, or creating boilerplate
models:
- anthropic/claude-sonnet-4-20250514
- openai/gpt-4o
selection_policy:
prefer: fastest
model_metrics_sources:
- type: cost
provider: digitalocean
refresh_interval: 3600
model_aliases:
openai-gpt-4o: openai/gpt-4o
openai-gpt-4o-mini: openai/gpt-4o-mini
anthropic-claude-sonnet-4: anthropic/claude-sonnet-4-20250514
- type: latency
provider: prometheus
url: http://localhost:9090
query: model_latency_p95_seconds
refresh_interval: 60
```
Start the metrics infrastructure and Plano:
```bash
# Start Prometheus + mock metrics server
docker compose up -d
# Start Plano
planoai up config.yaml
```
### Code Generation: Ranked by Latency
A coding request hits the `code_generation` route. With `prefer: fastest`, the metrics service checks P95 latencies from Prometheus — Claude-Sonnet at 0.85s beats GPT-4o at 1.20s:
```bash
curl -s http://localhost:12000/routing/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "gpt-4o-mini",
"messages": [
{"role": "user", "content": "Write a Python function that implements binary search on a sorted array"}
]
}'
```
```json
{
"models": ["anthropic/claude-sonnet-4-20250514", "openai/gpt-4o"],
"route": "code_generation",
"trace_id": "c16d1096c1af4a17abb48fb182918a88"
}
```
### Complex Reasoning: Ranked by Cost
A reasoning request hits `complex_reasoning` with `prefer: cheapest`. DigitalOcean pricing puts GPT-4o-mini ($0.75/M tokens) well ahead of GPT-4o ($25/M):
```bash
curl -s http://localhost:12000/routing/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "gpt-4o-mini",
"messages": [
{"role": "user", "content": "Explain the trade-offs between microservices and monolithic architectures, considering scalability, team structure, and operational complexity"}
]
}'
```
```json
{
"models": ["openai/gpt-4o-mini", "openai/gpt-4o"],
"route": "complex_reasoning",
"trace_id": "..."
}
```
### Per-Request Overrides
Config-level preferences set the default, but individual requests can override them with an inline `routing_preferences` field. This is stripped from the request before forwarding upstream — downstream providers never see it:
```bash
curl -s http://localhost:12000/routing/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "gpt-4o-mini",
"messages": [
{"role": "user", "content": "Summarize the key differences between TCP and UDP"}
],
"routing_preferences": [
{
"name": "general",
"description": "general questions, explanations, and summaries",
"models": ["openai/gpt-4o", "openai/gpt-4o-mini"],
"selection_policy": {"prefer": "cheapest"}
}
]
}'
```
The response includes a ranked `models` array. The client pattern is simple — use `models[0]`, fall back to `models[1]` on 429 or 5xx:
```python
response = plano.routing_decision(request)
for model in response["models"]:
try:
result = call_llm(model, messages)
break # success
except (RateLimitError, ServerError):
continue # try next
```
The `/routing/v1/*` endpoints return routing decisions without forwarding to the LLM — useful for testing routing behavior, integrating with existing orchestration code, or implementing custom fallback logic.
## Production Deployment: Self-Hosted on Kubernetes
For teams that need routing decisions to stay within their cluster — regulatory requirements, data sovereignty, or simply avoiding external API dependencies — Arch-Router can be self-hosted using vLLM.
The deployment uses an init container to download quantized weights from HuggingFace, then serves the model via vLLM's OpenAI-compatible endpoint:
```yaml
initContainers:
- name: download-model
image: python:3.11-slim
command:
- sh
- -c
- |
pip install huggingface_hub[cli] && \
python -c "from huggingface_hub import snapshot_download; \
snapshot_download('katanemo/Arch-Router-1.5B.gguf', \
local_dir='/models/Arch-Router-1.5B.gguf')"
containers:
- name: vllm
image: vllm/vllm-openai:latest
command:
- vllm
- serve
- /models/Arch-Router-1.5B.gguf/Arch-Router-1.5B-Q4_K_M.gguf
- "--served-model-name"
- "Arch-Router"
- "--gpu-memory-utilization"
- "0.3"
- "--enable-prefix-caching"
resources:
requests:
nvidia.com/gpu: "1"
memory: "4Gi"
```
GPU requirements are modest: a single L4 or L40S with 30% memory utilization. Prefix caching is enabled because route descriptions are constant across requests — the system prompt prefix is computed once and reused, cutting inference latency further.
The Plano config points to the in-cluster service:
```yaml
overrides:
llm_routing_model: plano/Arch-Router
model_providers:
- model: plano/Arch-Router
base_url: http://arch-router:10000
```
For teams that don't want to manage GPU infrastructure, DigitalOcean's [GPU Droplets](https://www.digitalocean.com/products/gpu-droplets) provide single-click deployment of vLLM with NVIDIA L40S GPUs — spin up the Arch-Router as a managed inference endpoint without provisioning bare metal.
## What We Learned
Building and operating this in production surfaced a few non-obvious lessons:
**Purpose-built models beat general-purpose models for classification — if you have the training data.** A 1.5B model fine-tuned on routing decisions outperforms GPT-4 few-shot prompting on our benchmark, at 1/30th the cost and 1/20th the latency. The key is that routing is a narrow, well-defined task. You don't need a model that can write poetry to decide whether a query is about code or about cooking.
**Startup validation prevents an entire class of silent bugs.** Early versions logged warnings for misconfigured metrics sources. Users didn't notice the warnings, deployed to production, and spent hours debugging why "cheapest" routing wasn't actually routing by cost. Crashing at startup is better UX than silent degradation.
**The WASM no_std constraint produces cleaner code.** Not being able to reach for tokio or std::net forces a callback-driven architecture where every I/O operation is explicit. The resulting code is harder to write but trivially auditable — you can trace every external call from the code alone, without understanding a runtime.
**Live metrics ranking is more useful than static config because model performance drifts.** Provider latency varies by 2-3x throughout the day based on traffic patterns. A model that's "fastest" at 2am is often the slowest at 2pm. Refreshing Prometheus data every 60 seconds catches these shifts; static config doesn't.
---
The Model Routing Service is open source as part of [Plano](https://github.com/katanemo/plano). The complete demo, including Docker Compose, Kubernetes manifests, and example scripts, is at [`demos/llm_routing/model_routing_service/`](https://github.com/katanemo/plano/tree/main/demos/llm_routing/model_routing_service).

View file

@ -0,0 +1,76 @@
.. _logging:
Logging
=======
Plano supports dynamic log level changes at runtime, allowing you to increase
verbosity for debugging without restarting the service.
Setting the Log Level at Startup
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Set the ``LOG_LEVEL`` environment variable before starting Plano:
.. code-block:: bash
LOG_LEVEL=debug planoai up config.yaml
This controls both the brightstaff service (``RUST_LOG``) and Envoy's WASM
component log level.
Changing the Log Level at Runtime
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Use the ``planoai log-level`` command to change levels on a running instance:
.. code-block:: bash
# Set both services to debug
planoai log-level debug
# Set both services to info
planoai log-level info
# Show current log levels
planoai log-level --show
# For Docker-based instances
planoai log-level debug --docker
The brightstaff service also accepts granular ``RUST_LOG``-style filters:
.. code-block:: bash
# Debug for brightstaff crate only, info for everything else
planoai log-level "brightstaff=debug,info"
Available log levels (from most to least verbose): ``trace``, ``debug``,
``info``, ``warn``, ``error``.
Direct API Access
~~~~~~~~~~~~~~~~~
You can also change log levels directly via HTTP:
**Brightstaff** (port 9091, or 19091 in Docker mode):
.. code-block:: bash
# Get current level
curl http://localhost:9091/admin/log-level
# Set level
curl -X PUT http://localhost:9091/admin/log-level -d "debug"
**Envoy** (port 9901, or 19901 in Docker mode):
.. code-block:: bash
# View all logger levels
curl http://localhost:9901/logging
# Set all loggers to debug
curl -X POST "http://localhost:9901/logging?level=debug"
# Set only WASM component to debug
curl -X POST "http://localhost:9901/logging?wasm=debug"

View file

@ -9,3 +9,4 @@ Observability
tracing
monitoring
access_logging
logging