From af98c11a6dbd7d7434f358a88b518804205d93f8 Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Mon, 30 Mar 2026 17:12:20 -0700 Subject: [PATCH] restructure model_metrics_sources to type + provider (#855) --- config/plano_config_schema.yaml | 74 +++---- crates/brightstaff/src/main.rs | 49 ++--- .../brightstaff/src/router/model_metrics.rs | 205 ++++++++---------- crates/common/src/configuration.rs | 61 +++--- .../model_routing_service/README.md | 93 +------- .../model_routing_service/config.yaml | 23 -- docs/routing-api.md | 121 +---------- 7 files changed, 171 insertions(+), 455 deletions(-) diff --git a/config/plano_config_schema.yaml b/config/plano_config_schema.yaml index 6ee28a83..a2c34c91 100644 --- a/config/plano_config_schema.yaml +++ b/config/plano_config_schema.yaml @@ -502,7 +502,6 @@ properties: - name - description - models - - selection_policy model_metrics_sources: type: array @@ -512,52 +511,11 @@ properties: properties: type: type: string - const: cost_metrics - url: + const: cost + provider: type: string - refresh_interval: - type: integer - minimum: 1 - auth: - type: object - properties: - type: - type: string - enum: - - bearer - token: - type: string - required: - - type - - token - additionalProperties: false - required: - - type - - url - additionalProperties: false - - type: object - properties: - type: - type: string - const: prometheus_metrics - url: - type: string - query: - type: string - refresh_interval: - type: integer - minimum: 1 - description: "Refresh interval in seconds" - required: - - type - - url - - query - additionalProperties: false - - type: object - properties: - type: - type: string - const: digitalocean_pricing + enum: + - digitalocean refresh_interval: type: integer minimum: 1 @@ -569,6 +527,30 @@ properties: type: string required: - type + - provider + additionalProperties: false + - type: object + properties: + type: + type: string + const: latency + provider: + type: string + enum: + - prometheus + url: + type: string + query: + type: string + refresh_interval: + type: integer + minimum: 1 + description: "Refresh interval in seconds" + required: + - type + - provider + - url + - query additionalProperties: false additionalProperties: false diff --git a/crates/brightstaff/src/main.rs b/crates/brightstaff/src/main.rs index 19ef9efb..bc88b60b 100644 --- a/crates/brightstaff/src/main.rs +++ b/crates/brightstaff/src/main.rs @@ -216,33 +216,17 @@ async fn init_app_state( use common::configuration::MetricsSource; let cost_count = sources .iter() - .filter(|s| matches!(s, MetricsSource::CostMetrics { .. })) + .filter(|s| matches!(s, MetricsSource::Cost(_))) .count(); - let prom_count = sources + let latency_count = sources .iter() - .filter(|s| matches!(s, MetricsSource::PrometheusMetrics { .. })) - .count(); - let do_count = sources - .iter() - .filter(|s| matches!(s, MetricsSource::DigitalOceanPricing { .. })) + .filter(|s| matches!(s, MetricsSource::Latency(_))) .count(); if cost_count > 1 { - return Err("model_metrics_sources: only one cost_metrics source is allowed".into()); + return Err("model_metrics_sources: only one cost metrics source is allowed".into()); } - if prom_count > 1 { - return Err( - "model_metrics_sources: only one prometheus_metrics source is allowed".into(), - ); - } - if do_count > 1 { - return Err( - "model_metrics_sources: only one digitalocean_pricing source is allowed".into(), - ); - } - if cost_count > 0 && do_count > 0 { - return Err( - "model_metrics_sources: cost_metrics and digitalocean_pricing cannot both be configured — use one or the other".into(), - ); + if latency_count > 1 { + return Err("model_metrics_sources: only one latency metrics source is allowed".into()); } let svc = ModelMetricsService::new(sources, reqwest::Client::new()).await; Some(Arc::new(svc)) @@ -259,32 +243,27 @@ async fn init_app_state( .as_deref() .unwrap_or_default() .iter() - .any(|s| { - matches!( - s, - MetricsSource::CostMetrics { .. } | MetricsSource::DigitalOceanPricing { .. } - ) - }); - let has_prometheus = config + .any(|s| matches!(s, MetricsSource::Cost(_))); + let has_latency_source = config .model_metrics_sources .as_deref() .unwrap_or_default() .iter() - .any(|s| matches!(s, MetricsSource::PrometheusMetrics { .. })); + .any(|s| matches!(s, MetricsSource::Latency(_))); for pref in prefs { if pref.selection_policy.prefer == SelectionPreference::Cheapest && !has_cost_source { return Err(format!( - "routing_preferences route '{}' uses prefer: cheapest but no cost data source is configured — \ - add cost_metrics or digitalocean_pricing to model_metrics_sources", + "routing_preferences route '{}' uses prefer: cheapest but no cost metrics source is configured — \ + add a cost metrics source to model_metrics_sources", pref.name ) .into()); } - if pref.selection_policy.prefer == SelectionPreference::Fastest && !has_prometheus { + if pref.selection_policy.prefer == SelectionPreference::Fastest && !has_latency_source { return Err(format!( - "routing_preferences route '{}' uses prefer: fastest but no prometheus_metrics source is configured — \ - add prometheus_metrics to model_metrics_sources", + "routing_preferences route '{}' uses prefer: fastest but no latency metrics source is configured — \ + add a latency metrics source to model_metrics_sources", pref.name ) .into()); diff --git a/crates/brightstaff/src/router/model_metrics.rs b/crates/brightstaff/src/router/model_metrics.rs index 2ad40254..1adb408d 100644 --- a/crates/brightstaff/src/router/model_metrics.rs +++ b/crates/brightstaff/src/router/model_metrics.rs @@ -2,9 +2,11 @@ use std::collections::HashMap; use std::sync::Arc; use std::time::Duration; -use common::configuration::{MetricsSource, SelectionPolicy, SelectionPreference}; +use common::configuration::{ + CostProvider, LatencyProvider, MetricsSource, SelectionPolicy, SelectionPreference, +}; use tokio::sync::RwLock; -use tracing::{info, warn}; +use tracing::{debug, info, warn}; const DO_PRICING_URL: &str = "https://api.digitalocean.com/v2/gen-ai/models/catalog"; @@ -20,81 +22,52 @@ impl ModelMetricsService { for source in sources { match source { - MetricsSource::CostMetrics { - url, - refresh_interval, - auth, - } => { - let data = fetch_cost_metrics(url, auth.as_ref(), &client).await; - info!(models = data.len(), url = %url, "fetched cost metrics"); - *cost_data.write().await = data; + MetricsSource::Cost(cfg) => match cfg.provider { + CostProvider::Digitalocean => { + let aliases = cfg.model_aliases.clone().unwrap_or_default(); + let data = fetch_do_pricing(&client, &aliases).await; + info!(models = data.len(), "fetched digitalocean pricing"); + *cost_data.write().await = data; - if let Some(interval_secs) = refresh_interval { - let cost_clone = Arc::clone(&cost_data); - let client_clone = client.clone(); - let url = url.clone(); - let auth = auth.clone(); - let interval = Duration::from_secs(*interval_secs); - tokio::spawn(async move { - loop { - tokio::time::sleep(interval).await; - let data = - fetch_cost_metrics(&url, auth.as_ref(), &client_clone).await; - info!(models = data.len(), url = %url, "refreshed cost metrics"); - *cost_clone.write().await = data; - } - }); + if let Some(interval_secs) = cfg.refresh_interval { + let cost_clone = Arc::clone(&cost_data); + let client_clone = client.clone(); + let interval = Duration::from_secs(interval_secs); + tokio::spawn(async move { + loop { + tokio::time::sleep(interval).await; + let data = fetch_do_pricing(&client_clone, &aliases).await; + info!(models = data.len(), "refreshed digitalocean pricing"); + *cost_clone.write().await = data; + } + }); + } } - } - MetricsSource::PrometheusMetrics { - url, - query, - refresh_interval, - } => { - let data = fetch_prometheus_metrics(url, query, &client).await; - info!(models = data.len(), url = %url, "fetched prometheus latency metrics"); - *latency_data.write().await = data; + }, + MetricsSource::Latency(cfg) => match cfg.provider { + LatencyProvider::Prometheus => { + let data = fetch_prometheus_metrics(&cfg.url, &cfg.query, &client).await; + info!(models = data.len(), url = %cfg.url, "fetched latency metrics"); + *latency_data.write().await = data; - if let Some(interval_secs) = refresh_interval { - let latency_clone = Arc::clone(&latency_data); - let client_clone = client.clone(); - let url = url.clone(); - let query = query.clone(); - let interval = Duration::from_secs(*interval_secs); - tokio::spawn(async move { - loop { - tokio::time::sleep(interval).await; - let data = - fetch_prometheus_metrics(&url, &query, &client_clone).await; - info!(models = data.len(), url = %url, "refreshed prometheus latency metrics"); - *latency_clone.write().await = data; - } - }); + if let Some(interval_secs) = cfg.refresh_interval { + let latency_clone = Arc::clone(&latency_data); + let client_clone = client.clone(); + let url = cfg.url.clone(); + let query = cfg.query.clone(); + let interval = Duration::from_secs(interval_secs); + tokio::spawn(async move { + loop { + tokio::time::sleep(interval).await; + let data = + fetch_prometheus_metrics(&url, &query, &client_clone).await; + info!(models = data.len(), url = %url, "refreshed latency metrics"); + *latency_clone.write().await = data; + } + }); + } } - } - MetricsSource::DigitalOceanPricing { - refresh_interval, - model_aliases, - } => { - let aliases = model_aliases.clone().unwrap_or_default(); - let data = fetch_do_pricing(&client, &aliases).await; - info!(models = data.len(), "fetched digitalocean pricing"); - *cost_data.write().await = data; - - if let Some(interval_secs) = refresh_interval { - let cost_clone = Arc::clone(&cost_data); - let client_clone = client.clone(); - let interval = Duration::from_secs(*interval_secs); - tokio::spawn(async move { - loop { - tokio::time::sleep(interval).await; - let data = fetch_do_pricing(&client_clone, &aliases).await; - info!(models = data.len(), "refreshed digitalocean pricing"); - *cost_clone.write().await = data; - } - }); - } - } + }, } } @@ -107,24 +80,32 @@ impl ModelMetricsService { /// Rank `models` by `policy`, returning them in preference order. /// Models with no metric data are appended at the end in their original order. pub async fn rank_models(&self, models: &[String], policy: &SelectionPolicy) -> Vec { + let cost_data = self.cost.read().await; + let latency_data = self.latency.read().await; + debug!( + input_models = ?models, + cost_data = ?cost_data.iter().collect::>(), + latency_data = ?latency_data.iter().collect::>(), + prefer = ?policy.prefer, + "rank_models called" + ); + match policy.prefer { SelectionPreference::Cheapest => { - let data = self.cost.read().await; for m in models { - if !data.contains_key(m.as_str()) { + if !cost_data.contains_key(m.as_str()) { warn!(model = %m, "no cost data for model — ranking last (prefer: cheapest)"); } } - rank_by_ascending_metric(models, &data) + rank_by_ascending_metric(models, &cost_data) } SelectionPreference::Fastest => { - let data = self.latency.read().await; for m in models { - if !data.contains_key(m.as_str()) { + if !latency_data.contains_key(m.as_str()) { warn!(model = %m, "no latency data for model — ranking last (prefer: fastest)"); } } - rank_by_ascending_metric(models, &data) + rank_by_ascending_metric(models, &latency_data) } SelectionPreference::None => models.to_vec(), } @@ -144,13 +125,20 @@ impl ModelMetricsService { fn rank_by_ascending_metric(models: &[String], data: &HashMap) -> Vec { let mut with_data: Vec<(&String, f64)> = models .iter() - .filter_map(|m| data.get(m.as_str()).map(|v| (m, *v))) + .filter_map(|m| { + let v = *data.get(m.as_str())?; + if v.is_nan() { + None + } else { + Some((m, v)) + } + }) .collect(); with_data.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal)); let without_data: Vec<&String> = models .iter() - .filter(|m| !data.contains_key(m.as_str())) + .filter(|m| data.get(m.as_str()).is_none_or(|v| v.is_nan())) .collect(); with_data @@ -160,43 +148,6 @@ fn rank_by_ascending_metric(models: &[String], data: &HashMap) -> V .collect() } -#[derive(serde::Deserialize)] -struct CostEntry { - input_per_million: f64, - output_per_million: f64, -} - -async fn fetch_cost_metrics( - url: &str, - auth: Option<&common::configuration::MetricsAuth>, - client: &reqwest::Client, -) -> HashMap { - let mut req = client.get(url); - if let Some(auth) = auth { - if auth.auth_type == "bearer" { - req = req.header("Authorization", format!("Bearer {}", auth.token)); - } else { - warn!(auth_type = %auth.auth_type, "unsupported auth type for cost_metrics, skipping auth"); - } - } - match req.send().await { - Ok(resp) => match resp.json::>().await { - Ok(data) => data - .into_iter() - .map(|(k, v)| (k, v.input_per_million + v.output_per_million)) - .collect(), - Err(err) => { - warn!(error = %err, url = %url, "failed to parse cost metrics response"); - HashMap::new() - } - }, - Err(err) => { - warn!(error = %err, url = %url, "failed to fetch cost metrics"); - HashMap::new() - } - } -} - #[derive(serde::Deserialize)] struct DoModelList { data: Vec, @@ -416,4 +367,22 @@ mod tests { // none → original order, despite gpt-4o-mini being cheaper assert_eq!(result, vec!["gpt-4o", "gpt-4o-mini"]); } + + #[test] + fn test_rank_by_ascending_metric_nan_treated_as_missing() { + let models = vec![ + "a".to_string(), + "b".to_string(), + "c".to_string(), + "d".to_string(), + ]; + let mut data = HashMap::new(); + data.insert("a".to_string(), f64::NAN); + data.insert("b".to_string(), 0.5); + data.insert("c".to_string(), 0.1); + // "d" has no entry at all + let result = rank_by_ascending_metric(&models, &data); + // c (0.1) < b (0.5), then NaN "a" and missing "d" appended in original order + assert_eq!(result, vec!["c", "b", "a", "d"]); + } } diff --git a/crates/common/src/configuration.rs b/crates/common/src/configuration.rs index cd074a8b..653f4d8c 100644 --- a/crates/common/src/configuration.rs +++ b/crates/common/src/configuration.rs @@ -104,16 +104,17 @@ pub enum StateStorageType { Postgres, } -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)] #[serde(rename_all = "lowercase")] pub enum SelectionPreference { Cheapest, Fastest, /// Return models in the same order they were defined — no reordering. + #[default] None, } -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Default, Serialize, Deserialize)] pub struct SelectionPolicy { pub prefer: SelectionPreference, } @@ -123,36 +124,44 @@ pub struct TopLevelRoutingPreference { pub name: String, pub description: String, pub models: Vec, + #[serde(default)] pub selection_policy: SelectionPolicy, } -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct MetricsAuth { - #[serde(rename = "type")] - pub auth_type: String, // only "bearer" supported - pub token: String, -} - #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(tag = "type", rename_all = "snake_case")] pub enum MetricsSource { - CostMetrics { - url: String, - refresh_interval: Option, - auth: Option, - }, - PrometheusMetrics { - url: String, - query: String, - refresh_interval: Option, - }, - #[serde(rename = "digitalocean_pricing")] - DigitalOceanPricing { - refresh_interval: Option, - /// Map DO catalog keys (`lowercase(creator)/model_id`) to Plano model names. - /// Example: `openai/openai-gpt-oss-120b: openai/gpt-4o` - model_aliases: Option>, - }, + Cost(CostMetricsConfig), + Latency(LatencyMetricsConfig), +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CostMetricsConfig { + pub provider: CostProvider, + pub refresh_interval: Option, + /// Map DO catalog keys (`lowercase(creator)/model_id`) to Plano model names. + /// Example: `openai/openai-gpt-oss-120b: openai/gpt-4o` + pub model_aliases: Option>, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum CostProvider { + Digitalocean, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct LatencyMetricsConfig { + pub provider: LatencyProvider, + pub url: String, + pub query: String, + pub refresh_interval: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum LatencyProvider { + Prometheus, } #[derive(Debug, Clone, Serialize, Deserialize)] diff --git a/demos/llm_routing/model_routing_service/README.md b/demos/llm_routing/model_routing_service/README.md index 85306c3b..e8c8a995 100644 --- a/demos/llm_routing/model_routing_service/README.md +++ b/demos/llm_routing/model_routing_service/README.md @@ -13,7 +13,6 @@ Plano is an AI-native proxy and data plane for agentic apps — with built-in or - **One endpoint, many models** — apps call Plano using standard OpenAI/Anthropic APIs; Plano handles provider selection, keys, and failover - **Intelligent routing** — a lightweight 1.5B router model classifies user intent and picks the best model per request -- **Cost & latency ranking** — models are ranked by live cost (DigitalOcean pricing API) or latency (Prometheus) before returning the fallback list - **Platform governance** — centralize API keys, rate limits, guardrails, and observability without touching app code - **Runs anywhere** — single binary; self-host the router for full data privacy @@ -30,38 +29,24 @@ routing_preferences: models: - openai/gpt-4o - openai/gpt-4o-mini - selection_policy: - prefer: cheapest # rank by live cost data - name: code_generation description: generating new code, writing functions, or creating boilerplate models: - anthropic/claude-sonnet-4-20250514 - openai/gpt-4o - selection_policy: - prefer: fastest # rank by Prometheus p95 latency ``` -### `selection_policy.prefer` values - -| Value | Behavior | -|---|---| -| `cheapest` | Sort models by ascending cost. Requires `cost_metrics` or `digitalocean_pricing` in `model_metrics_sources`. | -| `fastest` | Sort models by ascending P95 latency. Requires `prometheus_metrics` in `model_metrics_sources`. | -| `random` | Shuffle the model list on each request. | -| `none` | Return models in definition order — no reordering. | - When a request arrives, Plano: 1. Sends the conversation + route descriptions to Arch-Router for intent classification -2. Looks up the matched route and ranks its candidate models by cost or latency +2. Looks up the matched route and returns its candidate models 3. Returns an ordered list — client uses `models[0]`, falls back to `models[1]` on 429/5xx ``` 1. Request arrives → "Write binary search in Python" 2. Arch-Router classifies → route: "code_generation" -3. Rank by latency → claude-sonnet (0.85s) < gpt-4o (1.2s) -4. Response → models: ["anthropic/claude-sonnet-4-20250514", "openai/gpt-4o"] +3. Response → models: ["anthropic/claude-sonnet-4-20250514", "openai/gpt-4o"] ``` No match? Arch-Router returns `null` route → client falls back to the model in the original request. @@ -77,28 +62,12 @@ export OPENAI_API_KEY= export ANTHROPIC_API_KEY= ``` -Start Prometheus and the mock latency metrics server: +Start Plano: ```bash -cd demos/llm_routing/model_routing_service -docker compose up -d +planoai up demos/llm_routing/model_routing_service/config.yaml ``` -Then start Plano: - -```bash -planoai up config.yaml -``` - -On startup you should see logs like: - -``` -fetched digitalocean pricing: N models -fetched prometheus latency metrics: 3 models -``` - -If a model in `routing_preferences` has no matching pricing or latency data, Plano logs a warning at startup — the model is still included but ranked last. - ## Run the demo ```bash @@ -135,59 +104,7 @@ Response: } ``` -The response contains the ranked model list — your client should try `models[0]` first and fall back to `models[1]` on 429 or 5xx errors. - -## Metrics Sources - -### DigitalOcean Pricing (`digitalocean_pricing`) - -Fetches public model pricing from the DigitalOcean Gen-AI catalog (no auth required). Model IDs are normalized as `lowercase(creator)/model_id`. Cost scalar = `input_price_per_million + output_price_per_million`. - -```yaml -model_metrics_sources: - - type: digitalocean_pricing - refresh_interval: 3600 # re-fetch every hour -``` - -### Prometheus Latency (`prometheus_metrics`) - -Queries a Prometheus instance for P95 latency. The PromQL expression must return an instant vector with a `model_name` label matching the model names in `routing_preferences`. - -```yaml -model_metrics_sources: - - type: prometheus_metrics - url: http://localhost:9090 - query: model_latency_p95_seconds - refresh_interval: 60 -``` - -The demo's `metrics_server.py` exposes mock latency data; `docker compose up -d` starts it alongside Prometheus. - -### Custom Cost Endpoint (`cost_metrics`) - -```yaml -model_metrics_sources: - - type: cost_metrics - url: https://my-internal-pricing-api/costs - auth: - type: bearer - token: $PRICING_TOKEN - refresh_interval: 300 -``` - -Expected response format: -```json -{ - "anthropic/claude-sonnet-4-20250514": { - "input_per_million": 3.0, - "output_per_million": 15.0 - }, - "openai/gpt-4o": { - "input_per_million": 5.0, - "output_per_million": 20.0 - } -} -``` +The response contains the model list — your client should try `models[0]` first and fall back to `models[1]` on 429 or 5xx errors. ## Kubernetes Deployment (Self-hosted Arch-Router on GPU) diff --git a/demos/llm_routing/model_routing_service/config.yaml b/demos/llm_routing/model_routing_service/config.yaml index 6f20134b..0bcf658d 100644 --- a/demos/llm_routing/model_routing_service/config.yaml +++ b/demos/llm_routing/model_routing_service/config.yaml @@ -22,32 +22,9 @@ routing_preferences: models: - openai/gpt-4o - openai/gpt-4o-mini - selection_policy: - prefer: cheapest - name: code_generation description: generating new code, writing functions, or creating boilerplate models: - anthropic/claude-sonnet-4-20250514 - openai/gpt-4o - selection_policy: - prefer: fastest - -model_metrics_sources: - - type: digitalocean_pricing - refresh_interval: 3600 - model_aliases: - openai-gpt-4o: openai/gpt-4o - openai-gpt-4o-mini: openai/gpt-4o-mini - anthropic-claude-sonnet-4: anthropic/claude-sonnet-4-20250514 - - # Use cost_metrics instead of digitalocean_pricing to supply your own pricing data. - # The demo metrics_server.py exposes /costs with OpenAI and Anthropic pricing. - # - type: cost_metrics - # url: http://localhost:8080/costs - # refresh_interval: 300 - - - type: prometheus_metrics - url: http://localhost:9090 - query: model_latency_p95_seconds - refresh_interval: 60 diff --git a/docs/routing-api.md b/docs/routing-api.md index f3ed0552..0b30d627 100644 --- a/docs/routing-api.md +++ b/docs/routing-api.md @@ -21,14 +21,12 @@ POST /v1/chat/completions { "name": "code generation", "description": "generating new code snippets", - "models": ["anthropic/claude-sonnet-4-20250514", "openai/gpt-4o", "openai/gpt-4o-mini"], - "selection_policy": {"prefer": "fastest"} + "models": ["anthropic/claude-sonnet-4-20250514", "openai/gpt-4o", "openai/gpt-4o-mini"] }, { "name": "general questions", "description": "casual conversation and simple queries", - "models": ["openai/gpt-4o-mini"], - "selection_policy": {"prefer": "cheapest"} + "models": ["openai/gpt-4o-mini"] } ] } @@ -41,15 +39,6 @@ POST /v1/chat/completions | `name` | string | yes | Route identifier. Must match the LLM router's route classification. | | `description` | string | yes | Natural language description used by the router to match user intent. | | `models` | string[] | yes | Ordered candidate pool. At least one entry required. Must be declared in `model_providers`. | -| `selection_policy.prefer` | enum | yes | How to rank models: `cheapest`, `fastest`, or `none`. | - -### `selection_policy.prefer` values - -| Value | Behavior | -|---|---| -| `cheapest` | Sort by ascending cost from the metrics endpoint. Models with no data appended last. | -| `fastest` | Sort by ascending latency from the metrics endpoint. Models with no data appended last. | -| `none` | Return models in the order they were defined — no reordering. | ### Notes @@ -121,120 +110,14 @@ routing_preferences: models: - anthropic/claude-sonnet-4-20250514 - openai/gpt-4o - selection_policy: - prefer: fastest - name: general questions description: casual conversation and simple queries models: - openai/gpt-4o-mini - openai/gpt-4o - selection_policy: - prefer: cheapest - -# Optional: live cost and latency data sources (max one per type) -model_metrics_sources: - # Option A: DigitalOcean public pricing (no auth required) - - type: digitalocean_pricing - refresh_interval: 3600 - - # Option B: custom cost endpoint (mutually exclusive with digitalocean_pricing) - # - type: cost_metrics - # url: https://internal-cost-api/models - # refresh_interval: 300 # seconds; omit for fetch-once on startup - # auth: - # type: bearer - # token: $COST_API_TOKEN - - - type: prometheus_metrics - url: https://internal-prometheus/ - query: histogram_quantile(0.95, sum by (model_name, le) (rate(model_latency_seconds_bucket[5m]))) - refresh_interval: 60 ``` -### Startup validation - -Plano validates metric source configuration at startup and exits with a clear error if: - -| Condition | Error | -|---|---| -| `prefer: cheapest` with no cost source | `prefer: cheapest requires a cost data source — add cost_metrics or digitalocean_pricing` | -| `prefer: fastest` with no `prometheus_metrics` | `prefer: fastest requires a prometheus_metrics source` | -| Two `cost_metrics` entries | `only one cost_metrics source is allowed` | -| Two `prometheus_metrics` entries | `only one prometheus_metrics source is allowed` | -| Two `digitalocean_pricing` entries | `only one digitalocean_pricing source is allowed` | -| `cost_metrics` and `digitalocean_pricing` both present | `cannot both be configured — use one or the other` | - -If a model listed in `routing_preferences` has no matching entry in the fetched pricing or latency data, Plano logs a `WARN` at startup — the model is still included but ranked last. The same warning is also emitted per routing request when a model has no data in cache at decision time (relevant for inline `routing_preferences` overrides that reference models not covered by the configured metrics sources). - -### cost_metrics endpoint - -Plano GETs `url` on startup (and on each `refresh_interval`). Expected response — a JSON object mapping model name to an object with `input_per_million` and `output_per_million` fields: - -```json -{ - "anthropic/claude-sonnet-4-20250514": { - "input_per_million": 3.0, - "output_per_million": 15.0 - }, - "openai/gpt-4o": { - "input_per_million": 5.0, - "output_per_million": 20.0 - }, - "openai/gpt-4o-mini": { - "input_per_million": 0.15, - "output_per_million": 0.6 - } -} -``` - -- `auth.type: bearer` adds `Authorization: Bearer ` to the request -- Plano combines the two fields as `input_per_million + output_per_million` to produce a single cost scalar used for ranking -- Only relative order matters — the unit (e.g. USD per million tokens) is consistent so ranking is correct - -### digitalocean_pricing source - -Fetches public model pricing from the DigitalOcean Gen-AI catalog. No authentication required. - -```yaml -model_metrics_sources: - - type: digitalocean_pricing - refresh_interval: 3600 # re-fetch every hour; omit to fetch once on startup - model_aliases: - openai-gpt-4o: openai/gpt-4o - openai-gpt-4o-mini: openai/gpt-4o-mini - anthropic-claude-sonnet-4: anthropic/claude-sonnet-4-20250514 -``` - -DO catalog entries are stored by their `model_id` field (e.g. `openai-gpt-4o`). The cost scalar is `input_price_per_million + output_price_per_million`. - -**`model_aliases`** — optional. Maps DO `model_id` values to the model names used in `routing_preferences`. Without aliases, cost data is stored under the DO model_id (e.g. `openai-gpt-4o`), which won't match models configured as `openai/gpt-4o`. Aliases let you bridge the naming gap without changing your routing config. - -**Constraints:** -- `cost_metrics` and `digitalocean_pricing` cannot both be configured — use one or the other. -- Only one `digitalocean_pricing` entry is allowed. - -### prometheus_metrics endpoint - -Plano queries `{url}/api/v1/query?query={query}` on startup and each `refresh_interval`. The PromQL expression must return an instant vector with a `model_name` label: - -```json -{ - "status": "success", - "data": { - "resultType": "vector", - "result": [ - {"metric": {"model_name": "anthropic/claude-sonnet-4-20250514"}, "value": [1234567890, "120.5"]}, - {"metric": {"model_name": "openai/gpt-4o"}, "value": [1234567890, "200.3"]} - ] - } -} -``` - -- The PromQL query is responsible for computing the percentile (e.g. `histogram_quantile(0.95, ...)`) -- Latency units are arbitrary — only relative order matters -- Models missing from the result are appended at the end of the ranked list - --- ## Version Requirements