Merge branch 'main' into adil/release-0.4.15

This commit is contained in:
Adil Hafeez 2026-03-30 17:22:34 -07:00
commit 21aa91551d
7 changed files with 171 additions and 455 deletions

View file

@ -502,7 +502,6 @@ properties:
- name
- description
- models
- selection_policy
model_metrics_sources:
type: array
@ -512,52 +511,11 @@ properties:
properties:
type:
type: string
const: cost_metrics
url:
const: cost
provider:
type: string
refresh_interval:
type: integer
minimum: 1
auth:
type: object
properties:
type:
type: string
enum:
- bearer
token:
type: string
required:
- type
- token
additionalProperties: false
required:
- type
- url
additionalProperties: false
- type: object
properties:
type:
type: string
const: prometheus_metrics
url:
type: string
query:
type: string
refresh_interval:
type: integer
minimum: 1
description: "Refresh interval in seconds"
required:
- type
- url
- query
additionalProperties: false
- type: object
properties:
type:
type: string
const: digitalocean_pricing
enum:
- digitalocean
refresh_interval:
type: integer
minimum: 1
@ -569,6 +527,30 @@ properties:
type: string
required:
- type
- provider
additionalProperties: false
- type: object
properties:
type:
type: string
const: latency
provider:
type: string
enum:
- prometheus
url:
type: string
query:
type: string
refresh_interval:
type: integer
minimum: 1
description: "Refresh interval in seconds"
required:
- type
- provider
- url
- query
additionalProperties: false
additionalProperties: false

View file

@ -216,33 +216,17 @@ async fn init_app_state(
use common::configuration::MetricsSource;
let cost_count = sources
.iter()
.filter(|s| matches!(s, MetricsSource::CostMetrics { .. }))
.filter(|s| matches!(s, MetricsSource::Cost(_)))
.count();
let prom_count = sources
let latency_count = sources
.iter()
.filter(|s| matches!(s, MetricsSource::PrometheusMetrics { .. }))
.count();
let do_count = sources
.iter()
.filter(|s| matches!(s, MetricsSource::DigitalOceanPricing { .. }))
.filter(|s| matches!(s, MetricsSource::Latency(_)))
.count();
if cost_count > 1 {
return Err("model_metrics_sources: only one cost_metrics source is allowed".into());
return Err("model_metrics_sources: only one cost metrics source is allowed".into());
}
if prom_count > 1 {
return Err(
"model_metrics_sources: only one prometheus_metrics source is allowed".into(),
);
}
if do_count > 1 {
return Err(
"model_metrics_sources: only one digitalocean_pricing source is allowed".into(),
);
}
if cost_count > 0 && do_count > 0 {
return Err(
"model_metrics_sources: cost_metrics and digitalocean_pricing cannot both be configured — use one or the other".into(),
);
if latency_count > 1 {
return Err("model_metrics_sources: only one latency metrics source is allowed".into());
}
let svc = ModelMetricsService::new(sources, reqwest::Client::new()).await;
Some(Arc::new(svc))
@ -259,32 +243,27 @@ async fn init_app_state(
.as_deref()
.unwrap_or_default()
.iter()
.any(|s| {
matches!(
s,
MetricsSource::CostMetrics { .. } | MetricsSource::DigitalOceanPricing { .. }
)
});
let has_prometheus = config
.any(|s| matches!(s, MetricsSource::Cost(_)));
let has_latency_source = config
.model_metrics_sources
.as_deref()
.unwrap_or_default()
.iter()
.any(|s| matches!(s, MetricsSource::PrometheusMetrics { .. }));
.any(|s| matches!(s, MetricsSource::Latency(_)));
for pref in prefs {
if pref.selection_policy.prefer == SelectionPreference::Cheapest && !has_cost_source {
return Err(format!(
"routing_preferences route '{}' uses prefer: cheapest but no cost data source is configured — \
add cost_metrics or digitalocean_pricing to model_metrics_sources",
"routing_preferences route '{}' uses prefer: cheapest but no cost metrics source is configured — \
add a cost metrics source to model_metrics_sources",
pref.name
)
.into());
}
if pref.selection_policy.prefer == SelectionPreference::Fastest && !has_prometheus {
if pref.selection_policy.prefer == SelectionPreference::Fastest && !has_latency_source {
return Err(format!(
"routing_preferences route '{}' uses prefer: fastest but no prometheus_metrics source is configured — \
add prometheus_metrics to model_metrics_sources",
"routing_preferences route '{}' uses prefer: fastest but no latency metrics source is configured — \
add a latency metrics source to model_metrics_sources",
pref.name
)
.into());

View file

@ -2,9 +2,11 @@ use std::collections::HashMap;
use std::sync::Arc;
use std::time::Duration;
use common::configuration::{MetricsSource, SelectionPolicy, SelectionPreference};
use common::configuration::{
CostProvider, LatencyProvider, MetricsSource, SelectionPolicy, SelectionPreference,
};
use tokio::sync::RwLock;
use tracing::{info, warn};
use tracing::{debug, info, warn};
const DO_PRICING_URL: &str = "https://api.digitalocean.com/v2/gen-ai/models/catalog";
@ -20,81 +22,52 @@ impl ModelMetricsService {
for source in sources {
match source {
MetricsSource::CostMetrics {
url,
refresh_interval,
auth,
} => {
let data = fetch_cost_metrics(url, auth.as_ref(), &client).await;
info!(models = data.len(), url = %url, "fetched cost metrics");
*cost_data.write().await = data;
MetricsSource::Cost(cfg) => match cfg.provider {
CostProvider::Digitalocean => {
let aliases = cfg.model_aliases.clone().unwrap_or_default();
let data = fetch_do_pricing(&client, &aliases).await;
info!(models = data.len(), "fetched digitalocean pricing");
*cost_data.write().await = data;
if let Some(interval_secs) = refresh_interval {
let cost_clone = Arc::clone(&cost_data);
let client_clone = client.clone();
let url = url.clone();
let auth = auth.clone();
let interval = Duration::from_secs(*interval_secs);
tokio::spawn(async move {
loop {
tokio::time::sleep(interval).await;
let data =
fetch_cost_metrics(&url, auth.as_ref(), &client_clone).await;
info!(models = data.len(), url = %url, "refreshed cost metrics");
*cost_clone.write().await = data;
}
});
if let Some(interval_secs) = cfg.refresh_interval {
let cost_clone = Arc::clone(&cost_data);
let client_clone = client.clone();
let interval = Duration::from_secs(interval_secs);
tokio::spawn(async move {
loop {
tokio::time::sleep(interval).await;
let data = fetch_do_pricing(&client_clone, &aliases).await;
info!(models = data.len(), "refreshed digitalocean pricing");
*cost_clone.write().await = data;
}
});
}
}
}
MetricsSource::PrometheusMetrics {
url,
query,
refresh_interval,
} => {
let data = fetch_prometheus_metrics(url, query, &client).await;
info!(models = data.len(), url = %url, "fetched prometheus latency metrics");
*latency_data.write().await = data;
},
MetricsSource::Latency(cfg) => match cfg.provider {
LatencyProvider::Prometheus => {
let data = fetch_prometheus_metrics(&cfg.url, &cfg.query, &client).await;
info!(models = data.len(), url = %cfg.url, "fetched latency metrics");
*latency_data.write().await = data;
if let Some(interval_secs) = refresh_interval {
let latency_clone = Arc::clone(&latency_data);
let client_clone = client.clone();
let url = url.clone();
let query = query.clone();
let interval = Duration::from_secs(*interval_secs);
tokio::spawn(async move {
loop {
tokio::time::sleep(interval).await;
let data =
fetch_prometheus_metrics(&url, &query, &client_clone).await;
info!(models = data.len(), url = %url, "refreshed prometheus latency metrics");
*latency_clone.write().await = data;
}
});
if let Some(interval_secs) = cfg.refresh_interval {
let latency_clone = Arc::clone(&latency_data);
let client_clone = client.clone();
let url = cfg.url.clone();
let query = cfg.query.clone();
let interval = Duration::from_secs(interval_secs);
tokio::spawn(async move {
loop {
tokio::time::sleep(interval).await;
let data =
fetch_prometheus_metrics(&url, &query, &client_clone).await;
info!(models = data.len(), url = %url, "refreshed latency metrics");
*latency_clone.write().await = data;
}
});
}
}
}
MetricsSource::DigitalOceanPricing {
refresh_interval,
model_aliases,
} => {
let aliases = model_aliases.clone().unwrap_or_default();
let data = fetch_do_pricing(&client, &aliases).await;
info!(models = data.len(), "fetched digitalocean pricing");
*cost_data.write().await = data;
if let Some(interval_secs) = refresh_interval {
let cost_clone = Arc::clone(&cost_data);
let client_clone = client.clone();
let interval = Duration::from_secs(*interval_secs);
tokio::spawn(async move {
loop {
tokio::time::sleep(interval).await;
let data = fetch_do_pricing(&client_clone, &aliases).await;
info!(models = data.len(), "refreshed digitalocean pricing");
*cost_clone.write().await = data;
}
});
}
}
},
}
}
@ -107,24 +80,32 @@ impl ModelMetricsService {
/// Rank `models` by `policy`, returning them in preference order.
/// Models with no metric data are appended at the end in their original order.
pub async fn rank_models(&self, models: &[String], policy: &SelectionPolicy) -> Vec<String> {
let cost_data = self.cost.read().await;
let latency_data = self.latency.read().await;
debug!(
input_models = ?models,
cost_data = ?cost_data.iter().collect::<Vec<_>>(),
latency_data = ?latency_data.iter().collect::<Vec<_>>(),
prefer = ?policy.prefer,
"rank_models called"
);
match policy.prefer {
SelectionPreference::Cheapest => {
let data = self.cost.read().await;
for m in models {
if !data.contains_key(m.as_str()) {
if !cost_data.contains_key(m.as_str()) {
warn!(model = %m, "no cost data for model — ranking last (prefer: cheapest)");
}
}
rank_by_ascending_metric(models, &data)
rank_by_ascending_metric(models, &cost_data)
}
SelectionPreference::Fastest => {
let data = self.latency.read().await;
for m in models {
if !data.contains_key(m.as_str()) {
if !latency_data.contains_key(m.as_str()) {
warn!(model = %m, "no latency data for model — ranking last (prefer: fastest)");
}
}
rank_by_ascending_metric(models, &data)
rank_by_ascending_metric(models, &latency_data)
}
SelectionPreference::None => models.to_vec(),
}
@ -144,13 +125,20 @@ impl ModelMetricsService {
fn rank_by_ascending_metric(models: &[String], data: &HashMap<String, f64>) -> Vec<String> {
let mut with_data: Vec<(&String, f64)> = models
.iter()
.filter_map(|m| data.get(m.as_str()).map(|v| (m, *v)))
.filter_map(|m| {
let v = *data.get(m.as_str())?;
if v.is_nan() {
None
} else {
Some((m, v))
}
})
.collect();
with_data.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
let without_data: Vec<&String> = models
.iter()
.filter(|m| !data.contains_key(m.as_str()))
.filter(|m| data.get(m.as_str()).is_none_or(|v| v.is_nan()))
.collect();
with_data
@ -160,43 +148,6 @@ fn rank_by_ascending_metric(models: &[String], data: &HashMap<String, f64>) -> V
.collect()
}
#[derive(serde::Deserialize)]
struct CostEntry {
input_per_million: f64,
output_per_million: f64,
}
async fn fetch_cost_metrics(
url: &str,
auth: Option<&common::configuration::MetricsAuth>,
client: &reqwest::Client,
) -> HashMap<String, f64> {
let mut req = client.get(url);
if let Some(auth) = auth {
if auth.auth_type == "bearer" {
req = req.header("Authorization", format!("Bearer {}", auth.token));
} else {
warn!(auth_type = %auth.auth_type, "unsupported auth type for cost_metrics, skipping auth");
}
}
match req.send().await {
Ok(resp) => match resp.json::<HashMap<String, CostEntry>>().await {
Ok(data) => data
.into_iter()
.map(|(k, v)| (k, v.input_per_million + v.output_per_million))
.collect(),
Err(err) => {
warn!(error = %err, url = %url, "failed to parse cost metrics response");
HashMap::new()
}
},
Err(err) => {
warn!(error = %err, url = %url, "failed to fetch cost metrics");
HashMap::new()
}
}
}
#[derive(serde::Deserialize)]
struct DoModelList {
data: Vec<DoModel>,
@ -416,4 +367,22 @@ mod tests {
// none → original order, despite gpt-4o-mini being cheaper
assert_eq!(result, vec!["gpt-4o", "gpt-4o-mini"]);
}
#[test]
fn test_rank_by_ascending_metric_nan_treated_as_missing() {
let models = vec![
"a".to_string(),
"b".to_string(),
"c".to_string(),
"d".to_string(),
];
let mut data = HashMap::new();
data.insert("a".to_string(), f64::NAN);
data.insert("b".to_string(), 0.5);
data.insert("c".to_string(), 0.1);
// "d" has no entry at all
let result = rank_by_ascending_metric(&models, &data);
// c (0.1) < b (0.5), then NaN "a" and missing "d" appended in original order
assert_eq!(result, vec!["c", "b", "a", "d"]);
}
}

View file

@ -104,16 +104,17 @@ pub enum StateStorageType {
Postgres,
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum SelectionPreference {
Cheapest,
Fastest,
/// Return models in the same order they were defined — no reordering.
#[default]
None,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct SelectionPolicy {
pub prefer: SelectionPreference,
}
@ -123,36 +124,44 @@ pub struct TopLevelRoutingPreference {
pub name: String,
pub description: String,
pub models: Vec<String>,
#[serde(default)]
pub selection_policy: SelectionPolicy,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetricsAuth {
#[serde(rename = "type")]
pub auth_type: String, // only "bearer" supported
pub token: String,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum MetricsSource {
CostMetrics {
url: String,
refresh_interval: Option<u64>,
auth: Option<MetricsAuth>,
},
PrometheusMetrics {
url: String,
query: String,
refresh_interval: Option<u64>,
},
#[serde(rename = "digitalocean_pricing")]
DigitalOceanPricing {
refresh_interval: Option<u64>,
/// Map DO catalog keys (`lowercase(creator)/model_id`) to Plano model names.
/// Example: `openai/openai-gpt-oss-120b: openai/gpt-4o`
model_aliases: Option<HashMap<String, String>>,
},
Cost(CostMetricsConfig),
Latency(LatencyMetricsConfig),
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CostMetricsConfig {
pub provider: CostProvider,
pub refresh_interval: Option<u64>,
/// Map DO catalog keys (`lowercase(creator)/model_id`) to Plano model names.
/// Example: `openai/openai-gpt-oss-120b: openai/gpt-4o`
pub model_aliases: Option<HashMap<String, String>>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum CostProvider {
Digitalocean,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LatencyMetricsConfig {
pub provider: LatencyProvider,
pub url: String,
pub query: String,
pub refresh_interval: Option<u64>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum LatencyProvider {
Prometheus,
}
#[derive(Debug, Clone, Serialize, Deserialize)]

View file

@ -13,7 +13,6 @@ Plano is an AI-native proxy and data plane for agentic apps — with built-in or
- **One endpoint, many models** — apps call Plano using standard OpenAI/Anthropic APIs; Plano handles provider selection, keys, and failover
- **Intelligent routing** — a lightweight 1.5B router model classifies user intent and picks the best model per request
- **Cost & latency ranking** — models are ranked by live cost (DigitalOcean pricing API) or latency (Prometheus) before returning the fallback list
- **Platform governance** — centralize API keys, rate limits, guardrails, and observability without touching app code
- **Runs anywhere** — single binary; self-host the router for full data privacy
@ -30,38 +29,24 @@ routing_preferences:
models:
- openai/gpt-4o
- openai/gpt-4o-mini
selection_policy:
prefer: cheapest # rank by live cost data
- name: code_generation
description: generating new code, writing functions, or creating boilerplate
models:
- anthropic/claude-sonnet-4-20250514
- openai/gpt-4o
selection_policy:
prefer: fastest # rank by Prometheus p95 latency
```
### `selection_policy.prefer` values
| Value | Behavior |
|---|---|
| `cheapest` | Sort models by ascending cost. Requires `cost_metrics` or `digitalocean_pricing` in `model_metrics_sources`. |
| `fastest` | Sort models by ascending P95 latency. Requires `prometheus_metrics` in `model_metrics_sources`. |
| `random` | Shuffle the model list on each request. |
| `none` | Return models in definition order — no reordering. |
When a request arrives, Plano:
1. Sends the conversation + route descriptions to Arch-Router for intent classification
2. Looks up the matched route and ranks its candidate models by cost or latency
2. Looks up the matched route and returns its candidate models
3. Returns an ordered list — client uses `models[0]`, falls back to `models[1]` on 429/5xx
```
1. Request arrives → "Write binary search in Python"
2. Arch-Router classifies → route: "code_generation"
3. Rank by latency → claude-sonnet (0.85s) < gpt-4o (1.2s)
4. Response → models: ["anthropic/claude-sonnet-4-20250514", "openai/gpt-4o"]
3. Response → models: ["anthropic/claude-sonnet-4-20250514", "openai/gpt-4o"]
```
No match? Arch-Router returns `null` route → client falls back to the model in the original request.
@ -77,28 +62,12 @@ export OPENAI_API_KEY=<your-key>
export ANTHROPIC_API_KEY=<your-key>
```
Start Prometheus and the mock latency metrics server:
Start Plano:
```bash
cd demos/llm_routing/model_routing_service
docker compose up -d
planoai up demos/llm_routing/model_routing_service/config.yaml
```
Then start Plano:
```bash
planoai up config.yaml
```
On startup you should see logs like:
```
fetched digitalocean pricing: N models
fetched prometheus latency metrics: 3 models
```
If a model in `routing_preferences` has no matching pricing or latency data, Plano logs a warning at startup — the model is still included but ranked last.
## Run the demo
```bash
@ -135,59 +104,7 @@ Response:
}
```
The response contains the ranked model list — your client should try `models[0]` first and fall back to `models[1]` on 429 or 5xx errors.
## Metrics Sources
### DigitalOcean Pricing (`digitalocean_pricing`)
Fetches public model pricing from the DigitalOcean Gen-AI catalog (no auth required). Model IDs are normalized as `lowercase(creator)/model_id`. Cost scalar = `input_price_per_million + output_price_per_million`.
```yaml
model_metrics_sources:
- type: digitalocean_pricing
refresh_interval: 3600 # re-fetch every hour
```
### Prometheus Latency (`prometheus_metrics`)
Queries a Prometheus instance for P95 latency. The PromQL expression must return an instant vector with a `model_name` label matching the model names in `routing_preferences`.
```yaml
model_metrics_sources:
- type: prometheus_metrics
url: http://localhost:9090
query: model_latency_p95_seconds
refresh_interval: 60
```
The demo's `metrics_server.py` exposes mock latency data; `docker compose up -d` starts it alongside Prometheus.
### Custom Cost Endpoint (`cost_metrics`)
```yaml
model_metrics_sources:
- type: cost_metrics
url: https://my-internal-pricing-api/costs
auth:
type: bearer
token: $PRICING_TOKEN
refresh_interval: 300
```
Expected response format:
```json
{
"anthropic/claude-sonnet-4-20250514": {
"input_per_million": 3.0,
"output_per_million": 15.0
},
"openai/gpt-4o": {
"input_per_million": 5.0,
"output_per_million": 20.0
}
}
```
The response contains the model list — your client should try `models[0]` first and fall back to `models[1]` on 429 or 5xx errors.
## Kubernetes Deployment (Self-hosted Arch-Router on GPU)

View file

@ -22,32 +22,9 @@ routing_preferences:
models:
- openai/gpt-4o
- openai/gpt-4o-mini
selection_policy:
prefer: cheapest
- name: code_generation
description: generating new code, writing functions, or creating boilerplate
models:
- anthropic/claude-sonnet-4-20250514
- openai/gpt-4o
selection_policy:
prefer: fastest
model_metrics_sources:
- type: digitalocean_pricing
refresh_interval: 3600
model_aliases:
openai-gpt-4o: openai/gpt-4o
openai-gpt-4o-mini: openai/gpt-4o-mini
anthropic-claude-sonnet-4: anthropic/claude-sonnet-4-20250514
# Use cost_metrics instead of digitalocean_pricing to supply your own pricing data.
# The demo metrics_server.py exposes /costs with OpenAI and Anthropic pricing.
# - type: cost_metrics
# url: http://localhost:8080/costs
# refresh_interval: 300
- type: prometheus_metrics
url: http://localhost:9090
query: model_latency_p95_seconds
refresh_interval: 60

View file

@ -21,14 +21,12 @@ POST /v1/chat/completions
{
"name": "code generation",
"description": "generating new code snippets",
"models": ["anthropic/claude-sonnet-4-20250514", "openai/gpt-4o", "openai/gpt-4o-mini"],
"selection_policy": {"prefer": "fastest"}
"models": ["anthropic/claude-sonnet-4-20250514", "openai/gpt-4o", "openai/gpt-4o-mini"]
},
{
"name": "general questions",
"description": "casual conversation and simple queries",
"models": ["openai/gpt-4o-mini"],
"selection_policy": {"prefer": "cheapest"}
"models": ["openai/gpt-4o-mini"]
}
]
}
@ -41,15 +39,6 @@ POST /v1/chat/completions
| `name` | string | yes | Route identifier. Must match the LLM router's route classification. |
| `description` | string | yes | Natural language description used by the router to match user intent. |
| `models` | string[] | yes | Ordered candidate pool. At least one entry required. Must be declared in `model_providers`. |
| `selection_policy.prefer` | enum | yes | How to rank models: `cheapest`, `fastest`, or `none`. |
### `selection_policy.prefer` values
| Value | Behavior |
|---|---|
| `cheapest` | Sort by ascending cost from the metrics endpoint. Models with no data appended last. |
| `fastest` | Sort by ascending latency from the metrics endpoint. Models with no data appended last. |
| `none` | Return models in the order they were defined — no reordering. |
### Notes
@ -121,120 +110,14 @@ routing_preferences:
models:
- anthropic/claude-sonnet-4-20250514
- openai/gpt-4o
selection_policy:
prefer: fastest
- name: general questions
description: casual conversation and simple queries
models:
- openai/gpt-4o-mini
- openai/gpt-4o
selection_policy:
prefer: cheapest
# Optional: live cost and latency data sources (max one per type)
model_metrics_sources:
# Option A: DigitalOcean public pricing (no auth required)
- type: digitalocean_pricing
refresh_interval: 3600
# Option B: custom cost endpoint (mutually exclusive with digitalocean_pricing)
# - type: cost_metrics
# url: https://internal-cost-api/models
# refresh_interval: 300 # seconds; omit for fetch-once on startup
# auth:
# type: bearer
# token: $COST_API_TOKEN
- type: prometheus_metrics
url: https://internal-prometheus/
query: histogram_quantile(0.95, sum by (model_name, le) (rate(model_latency_seconds_bucket[5m])))
refresh_interval: 60
```
### Startup validation
Plano validates metric source configuration at startup and exits with a clear error if:
| Condition | Error |
|---|---|
| `prefer: cheapest` with no cost source | `prefer: cheapest requires a cost data source — add cost_metrics or digitalocean_pricing` |
| `prefer: fastest` with no `prometheus_metrics` | `prefer: fastest requires a prometheus_metrics source` |
| Two `cost_metrics` entries | `only one cost_metrics source is allowed` |
| Two `prometheus_metrics` entries | `only one prometheus_metrics source is allowed` |
| Two `digitalocean_pricing` entries | `only one digitalocean_pricing source is allowed` |
| `cost_metrics` and `digitalocean_pricing` both present | `cannot both be configured — use one or the other` |
If a model listed in `routing_preferences` has no matching entry in the fetched pricing or latency data, Plano logs a `WARN` at startup — the model is still included but ranked last. The same warning is also emitted per routing request when a model has no data in cache at decision time (relevant for inline `routing_preferences` overrides that reference models not covered by the configured metrics sources).
### cost_metrics endpoint
Plano GETs `url` on startup (and on each `refresh_interval`). Expected response — a JSON object mapping model name to an object with `input_per_million` and `output_per_million` fields:
```json
{
"anthropic/claude-sonnet-4-20250514": {
"input_per_million": 3.0,
"output_per_million": 15.0
},
"openai/gpt-4o": {
"input_per_million": 5.0,
"output_per_million": 20.0
},
"openai/gpt-4o-mini": {
"input_per_million": 0.15,
"output_per_million": 0.6
}
}
```
- `auth.type: bearer` adds `Authorization: Bearer <token>` to the request
- Plano combines the two fields as `input_per_million + output_per_million` to produce a single cost scalar used for ranking
- Only relative order matters — the unit (e.g. USD per million tokens) is consistent so ranking is correct
### digitalocean_pricing source
Fetches public model pricing from the DigitalOcean Gen-AI catalog. No authentication required.
```yaml
model_metrics_sources:
- type: digitalocean_pricing
refresh_interval: 3600 # re-fetch every hour; omit to fetch once on startup
model_aliases:
openai-gpt-4o: openai/gpt-4o
openai-gpt-4o-mini: openai/gpt-4o-mini
anthropic-claude-sonnet-4: anthropic/claude-sonnet-4-20250514
```
DO catalog entries are stored by their `model_id` field (e.g. `openai-gpt-4o`). The cost scalar is `input_price_per_million + output_price_per_million`.
**`model_aliases`** — optional. Maps DO `model_id` values to the model names used in `routing_preferences`. Without aliases, cost data is stored under the DO model_id (e.g. `openai-gpt-4o`), which won't match models configured as `openai/gpt-4o`. Aliases let you bridge the naming gap without changing your routing config.
**Constraints:**
- `cost_metrics` and `digitalocean_pricing` cannot both be configured — use one or the other.
- Only one `digitalocean_pricing` entry is allowed.
### prometheus_metrics endpoint
Plano queries `{url}/api/v1/query?query={query}` on startup and each `refresh_interval`. The PromQL expression must return an instant vector with a `model_name` label:
```json
{
"status": "success",
"data": {
"resultType": "vector",
"result": [
{"metric": {"model_name": "anthropic/claude-sonnet-4-20250514"}, "value": [1234567890, "120.5"]},
{"metric": {"model_name": "openai/gpt-4o"}, "value": [1234567890, "200.3"]}
]
}
}
```
- The PromQL query is responsible for computing the percentile (e.g. `histogram_quantile(0.95, ...)`)
- Latency units are arbitrary — only relative order matters
- Models missing from the result are appended at the end of the ranked list
---
## Version Requirements