Merge branch 'main' into adil/release-0.4.15

2026-04-25 00:36:34 +02:00 · 2026-03-30 17:22:34 -07:00 · 2026-03-30 17:22:34 -07:00 · 21aa91551d
commit 21aa91551d
parent 0a2e5a1be3 af98c11a6d
7 changed files with 171 additions and 455 deletions
--- a/config/plano_config_schema.yaml
+++ b/config/plano_config_schema.yaml
@ -502,7 +502,6 @@ properties:
        - name
        - description
        - models
        - selection_policy
  model_metrics_sources:
    type: array
@ -512,52 +511,11 @@ properties:
          properties:
            type:
              type: string
-              const: cost_metrics
+              const: cost
-            url:
+            provider:
              type: string
-            refresh_interval:
+              enum:
-              type: integer
+                - digitalocean
              minimum: 1
            auth:
              type: object
              properties:
                type:
                  type: string
                  enum:
                    - bearer
                token:
                  type: string
              required:
                - type
                - token
              additionalProperties: false
          required:
            - type
            - url
          additionalProperties: false
        - type: object
          properties:
            type:
              type: string
              const: prometheus_metrics
            url:
              type: string
            query:
              type: string
            refresh_interval:
              type: integer
              minimum: 1
              description: "Refresh interval in seconds"
          required:
            - type
            - url
            - query
          additionalProperties: false
        - type: object
          properties:
            type:
              type: string
              const: digitalocean_pricing
            refresh_interval:
              type: integer
              minimum: 1
@ -569,6 +527,30 @@ properties:
                type: string
          required:
            - type
            - provider
          additionalProperties: false
        - type: object
          properties:
            type:
              type: string
              const: latency
            provider:
              type: string
              enum:
                - prometheus
            url:
              type: string
            query:
              type: string
            refresh_interval:
              type: integer
              minimum: 1
              description: "Refresh interval in seconds"
          required:
            - type
            - provider
            - url
            - query
          additionalProperties: false
 additionalProperties: false
--- a/crates/brightstaff/src/main.rs
+++ b/crates/brightstaff/src/main.rs
@ -216,33 +216,17 @@ async fn init_app_state(
        use common::configuration::MetricsSource;
        let cost_count = sources
            .iter()
-            .filter(|s| matches!(s, MetricsSource::CostMetrics { .. }))
+            .filter(|s| matches!(s, MetricsSource::Cost(_)))
            .count();
-        let prom_count = sources
+        let latency_count = sources
            .iter()
-            .filter(|s| matches!(s, MetricsSource::PrometheusMetrics { .. }))
+            .filter(|s| matches!(s, MetricsSource::Latency(_)))
            .count();
        let do_count = sources
            .iter()
            .filter(|s| matches!(s, MetricsSource::DigitalOceanPricing { .. }))
            .count();
        if cost_count > 1 {
-            return Err("model_metrics_sources: only one cost_metrics source is allowed".into());
+            return Err("model_metrics_sources: only one cost metrics source is allowed".into());
        }
-        if prom_count > 1 {
+        if latency_count > 1 {
-            return Err(
+            return Err("model_metrics_sources: only one latency metrics source is allowed".into());
                "model_metrics_sources: only one prometheus_metrics source is allowed".into(),
            );
        }
        if do_count > 1 {
            return Err(
                "model_metrics_sources: only one digitalocean_pricing source is allowed".into(),
            );
        }
        if cost_count > 0 && do_count > 0 {
            return Err(
                "model_metrics_sources: cost_metrics and digitalocean_pricing cannot both be configured — use one or the other".into(),
            );
        }
        let svc = ModelMetricsService::new(sources, reqwest::Client::new()).await;
        Some(Arc::new(svc))
@ -259,32 +243,27 @@ async fn init_app_state(
            .as_deref()
            .unwrap_or_default()
            .iter()
-            .any(|s| {
+            .any(|s| matches!(s, MetricsSource::Cost(_)));
-                matches!(
+        let has_latency_source = config
                    s,
                    MetricsSource::CostMetrics { .. } | MetricsSource::DigitalOceanPricing { .. }
                )
            });
        let has_prometheus = config
            .model_metrics_sources
            .as_deref()
            .unwrap_or_default()
            .iter()
-            .any(|s| matches!(s, MetricsSource::PrometheusMetrics { .. }));
+            .any(|s| matches!(s, MetricsSource::Latency(_)));
        for pref in prefs {
            if pref.selection_policy.prefer == SelectionPreference::Cheapest && !has_cost_source {
                return Err(format!(
-                    "routing_preferences route '{}' uses prefer: cheapest but no cost data source is configured — \
+                    "routing_preferences route '{}' uses prefer: cheapest but no cost metrics source is configured — \
-                     add cost_metrics or digitalocean_pricing to model_metrics_sources",
+                     add a cost metrics source to model_metrics_sources",
                    pref.name
                )
                .into());
            }
-            if pref.selection_policy.prefer == SelectionPreference::Fastest && !has_prometheus {
+            if pref.selection_policy.prefer == SelectionPreference::Fastest && !has_latency_source {
                return Err(format!(
-                    "routing_preferences route '{}' uses prefer: fastest but no prometheus_metrics source is configured — \
+                    "routing_preferences route '{}' uses prefer: fastest but no latency metrics source is configured — \
-                     add prometheus_metrics to model_metrics_sources",
+                     add a latency metrics source to model_metrics_sources",
                    pref.name
                )
                .into());
--- a/crates/brightstaff/src/router/model_metrics.rs
+++ b/crates/brightstaff/src/router/model_metrics.rs
@ -2,9 +2,11 @@ use std::collections::HashMap;
 use std::sync::Arc;
 use std::time::Duration;
-use common::configuration::{MetricsSource, SelectionPolicy, SelectionPreference};
+use common::configuration::{
    CostProvider, LatencyProvider, MetricsSource, SelectionPolicy, SelectionPreference,
 };
 use tokio::sync::RwLock;
-use tracing::{info, warn};
+use tracing::{debug, info, warn};
 const DO_PRICING_URL: &str = "https://api.digitalocean.com/v2/gen-ai/models/catalog";
@ -20,81 +22,52 @@ impl ModelMetricsService {
        for source in sources {
            match source {
-                MetricsSource::CostMetrics {
+                MetricsSource::Cost(cfg) => match cfg.provider {
-                    url,
+                    CostProvider::Digitalocean => {
-                    refresh_interval,
+                        let aliases = cfg.model_aliases.clone().unwrap_or_default();
-                    auth,
+                        let data = fetch_do_pricing(&client, &aliases).await;
-                } => {
+                        info!(models = data.len(), "fetched digitalocean pricing");
-                    let data = fetch_cost_metrics(url, auth.as_ref(), &client).await;
+                        *cost_data.write().await = data;
                    info!(models = data.len(), url = %url, "fetched cost metrics");
                    *cost_data.write().await = data;
-                    if let Some(interval_secs) = refresh_interval {
+                        if let Some(interval_secs) = cfg.refresh_interval {
-                        let cost_clone = Arc::clone(&cost_data);
+                            let cost_clone = Arc::clone(&cost_data);
-                        let client_clone = client.clone();
+                            let client_clone = client.clone();
-                        let url = url.clone();
+                            let interval = Duration::from_secs(interval_secs);
-                        let auth = auth.clone();
+                            tokio::spawn(async move {
-                        let interval = Duration::from_secs(*interval_secs);
+                                loop {
-                        tokio::spawn(async move {
+                                    tokio::time::sleep(interval).await;
-                            loop {
+                                    let data = fetch_do_pricing(&client_clone, &aliases).await;
-                                tokio::time::sleep(interval).await;
+                                    info!(models = data.len(), "refreshed digitalocean pricing");
-                                let data =
+                                    *cost_clone.write().await = data;
-                                    fetch_cost_metrics(&url, auth.as_ref(), &client_clone).await;
+                                }
-                                info!(models = data.len(), url = %url, "refreshed cost metrics");
+                            });
-                                *cost_clone.write().await = data;
+                        }
                            }
                        });
                    }
-                }
+                },
-                MetricsSource::PrometheusMetrics {
+                MetricsSource::Latency(cfg) => match cfg.provider {
-                    url,
+                    LatencyProvider::Prometheus => {
-                    query,
+                        let data = fetch_prometheus_metrics(&cfg.url, &cfg.query, &client).await;
-                    refresh_interval,
+                        info!(models = data.len(), url = %cfg.url, "fetched latency metrics");
-                } => {
+                        *latency_data.write().await = data;
                    let data = fetch_prometheus_metrics(url, query, &client).await;
                    info!(models = data.len(), url = %url, "fetched prometheus latency metrics");
                    *latency_data.write().await = data;
-                    if let Some(interval_secs) = refresh_interval {
+                        if let Some(interval_secs) = cfg.refresh_interval {
-                        let latency_clone = Arc::clone(&latency_data);
+                            let latency_clone = Arc::clone(&latency_data);
-                        let client_clone = client.clone();
+                            let client_clone = client.clone();
-                        let url = url.clone();
+                            let url = cfg.url.clone();
-                        let query = query.clone();
+                            let query = cfg.query.clone();
-                        let interval = Duration::from_secs(*interval_secs);
+                            let interval = Duration::from_secs(interval_secs);
-                        tokio::spawn(async move {
+                            tokio::spawn(async move {
-                            loop {
+                                loop {
-                                tokio::time::sleep(interval).await;
+                                    tokio::time::sleep(interval).await;
-                                let data =
+                                    let data =
-                                    fetch_prometheus_metrics(&url, &query, &client_clone).await;
+                                        fetch_prometheus_metrics(&url, &query, &client_clone).await;
-                                info!(models = data.len(), url = %url, "refreshed prometheus latency metrics");
+                                    info!(models = data.len(), url = %url, "refreshed latency metrics");
-                                *latency_clone.write().await = data;
+                                    *latency_clone.write().await = data;
-                            }
+                                }
-                        });
+                            });
                        }
                    }
-                }
+                },
                MetricsSource::DigitalOceanPricing {
                    refresh_interval,
                    model_aliases,
                } => {
                    let aliases = model_aliases.clone().unwrap_or_default();
                    let data = fetch_do_pricing(&client, &aliases).await;
                    info!(models = data.len(), "fetched digitalocean pricing");
                    *cost_data.write().await = data;
                    if let Some(interval_secs) = refresh_interval {
                        let cost_clone = Arc::clone(&cost_data);
                        let client_clone = client.clone();
                        let interval = Duration::from_secs(*interval_secs);
                        tokio::spawn(async move {
                            loop {
                                tokio::time::sleep(interval).await;
                                let data = fetch_do_pricing(&client_clone, &aliases).await;
                                info!(models = data.len(), "refreshed digitalocean pricing");
                                *cost_clone.write().await = data;
                            }
                        });
                    }
                }
            }
        }
@ -107,24 +80,32 @@ impl ModelMetricsService {
    /// Rank `models` by `policy`, returning them in preference order.
    /// Models with no metric data are appended at the end in their original order.
    pub async fn rank_models(&self, models: &[String], policy: &SelectionPolicy) -> Vec<String> {
        let cost_data = self.cost.read().await;
        let latency_data = self.latency.read().await;
        debug!(
            input_models = ?models,
            cost_data = ?cost_data.iter().collect::<Vec<_>>(),
            latency_data = ?latency_data.iter().collect::<Vec<_>>(),
            prefer = ?policy.prefer,
            "rank_models called"
        );
        match policy.prefer {
            SelectionPreference::Cheapest => {
                let data = self.cost.read().await;
                for m in models {
-                    if !data.contains_key(m.as_str()) {
+                    if !cost_data.contains_key(m.as_str()) {
                        warn!(model = %m, "no cost data for model — ranking last (prefer: cheapest)");
                    }
                }
-                rank_by_ascending_metric(models, &data)
+                rank_by_ascending_metric(models, &cost_data)
            }
            SelectionPreference::Fastest => {
                let data = self.latency.read().await;
                for m in models {
-                    if !data.contains_key(m.as_str()) {
+                    if !latency_data.contains_key(m.as_str()) {
                        warn!(model = %m, "no latency data for model — ranking last (prefer: fastest)");
                    }
                }
-                rank_by_ascending_metric(models, &data)
+                rank_by_ascending_metric(models, &latency_data)
            }
            SelectionPreference::None => models.to_vec(),
        }
@ -144,13 +125,20 @@ impl ModelMetricsService {
 fn rank_by_ascending_metric(models: &[String], data: &HashMap<String, f64>) -> Vec<String> {
    let mut with_data: Vec<(&String, f64)> = models
        .iter()
-        .filter_map(|m| data.get(m.as_str()).map(|v| (m, *v)))
+        .filter_map(|m| {
            let v = *data.get(m.as_str())?;
            if v.is_nan() {
                None
            } else {
                Some((m, v))
            }
        })
        .collect();
    with_data.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
    let without_data: Vec<&String> = models
        .iter()
-        .filter(|m| !data.contains_key(m.as_str()))
+        .filter(|m| data.get(m.as_str()).is_none_or(|v| v.is_nan()))
        .collect();
    with_data
@ -160,43 +148,6 @@ fn rank_by_ascending_metric(models: &[String], data: &HashMap<String, f64>) -> V
        .collect()
 }
 #[derive(serde::Deserialize)]
 struct CostEntry {
    input_per_million: f64,
    output_per_million: f64,
 }
 async fn fetch_cost_metrics(
    url: &str,
    auth: Option<&common::configuration::MetricsAuth>,
    client: &reqwest::Client,
 ) -> HashMap<String, f64> {
    let mut req = client.get(url);
    if let Some(auth) = auth {
        if auth.auth_type == "bearer" {
            req = req.header("Authorization", format!("Bearer {}", auth.token));
        } else {
            warn!(auth_type = %auth.auth_type, "unsupported auth type for cost_metrics, skipping auth");
        }
    }
    match req.send().await {
        Ok(resp) => match resp.json::<HashMap<String, CostEntry>>().await {
            Ok(data) => data
                .into_iter()
                .map(|(k, v)| (k, v.input_per_million + v.output_per_million))
                .collect(),
            Err(err) => {
                warn!(error = %err, url = %url, "failed to parse cost metrics response");
                HashMap::new()
            }
        },
        Err(err) => {
            warn!(error = %err, url = %url, "failed to fetch cost metrics");
            HashMap::new()
        }
    }
 }
 #[derive(serde::Deserialize)]
 struct DoModelList {
    data: Vec<DoModel>,
@ -416,4 +367,22 @@ mod tests {
        // none → original order, despite gpt-4o-mini being cheaper
        assert_eq!(result, vec!["gpt-4o", "gpt-4o-mini"]);
    }
    #[test]
    fn test_rank_by_ascending_metric_nan_treated_as_missing() {
        let models = vec![
            "a".to_string(),
            "b".to_string(),
            "c".to_string(),
            "d".to_string(),
        ];
        let mut data = HashMap::new();
        data.insert("a".to_string(), f64::NAN);
        data.insert("b".to_string(), 0.5);
        data.insert("c".to_string(), 0.1);
        // "d" has no entry at all
        let result = rank_by_ascending_metric(&models, &data);
        // c (0.1) < b (0.5), then NaN "a" and missing "d" appended in original order
        assert_eq!(result, vec!["c", "b", "a", "d"]);
    }
 }
--- a/crates/common/src/configuration.rs
+++ b/crates/common/src/configuration.rs
@ -104,16 +104,17 @@ pub enum StateStorageType {
    Postgres,
 }
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
 #[serde(rename_all = "lowercase")]
 pub enum SelectionPreference {
    Cheapest,
    Fastest,
    /// Return models in the same order they were defined — no reordering.
    #[default]
    None,
 }
-#[derive(Debug, Clone, Serialize, Deserialize)]
+#[derive(Debug, Clone, Default, Serialize, Deserialize)]
 pub struct SelectionPolicy {
    pub prefer: SelectionPreference,
 }
@ -123,36 +124,44 @@ pub struct TopLevelRoutingPreference {
    pub name: String,
    pub description: String,
    pub models: Vec<String>,
    #[serde(default)]
    pub selection_policy: SelectionPolicy,
 }
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct MetricsAuth {
    #[serde(rename = "type")]
    pub auth_type: String, // only "bearer" supported
    pub token: String,
 }
 #[derive(Debug, Clone, Serialize, Deserialize)]
 #[serde(tag = "type", rename_all = "snake_case")]
 pub enum MetricsSource {
-    CostMetrics {
+    Cost(CostMetricsConfig),
-        url: String,
+    Latency(LatencyMetricsConfig),
-        refresh_interval: Option<u64>,
+}
-        auth: Option<MetricsAuth>,
+
-    },
+#[derive(Debug, Clone, Serialize, Deserialize)]
-    PrometheusMetrics {
+pub struct CostMetricsConfig {
-        url: String,
+    pub provider: CostProvider,
-        query: String,
+    pub refresh_interval: Option<u64>,
-        refresh_interval: Option<u64>,
+    /// Map DO catalog keys (`lowercase(creator)/model_id`) to Plano model names.
-    },
+    /// Example: `openai/openai-gpt-oss-120b: openai/gpt-4o`
-    #[serde(rename = "digitalocean_pricing")]
+    pub model_aliases: Option<HashMap<String, String>>,
-    DigitalOceanPricing {
+}
-        refresh_interval: Option<u64>,
+
-        /// Map DO catalog keys (`lowercase(creator)/model_id`) to Plano model names.
+#[derive(Debug, Clone, Serialize, Deserialize)]
-        /// Example: `openai/openai-gpt-oss-120b: openai/gpt-4o`
+#[serde(rename_all = "snake_case")]
-        model_aliases: Option<HashMap<String, String>>,
+pub enum CostProvider {
-    },
+    Digitalocean,
 }
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct LatencyMetricsConfig {
    pub provider: LatencyProvider,
    pub url: String,
    pub query: String,
    pub refresh_interval: Option<u64>,
 }
 #[derive(Debug, Clone, Serialize, Deserialize)]
 #[serde(rename_all = "snake_case")]
 pub enum LatencyProvider {
    Prometheus,
 }
 #[derive(Debug, Clone, Serialize, Deserialize)]
--- a/demos/llm_routing/model_routing_service/README.md
+++ b/demos/llm_routing/model_routing_service/README.md
@ -13,7 +13,6 @@ Plano is an AI-native proxy and data plane for agentic apps — with built-in or
 - **One endpoint, many models** — apps call Plano using standard OpenAI/Anthropic APIs; Plano handles provider selection, keys, and failover
 - **Intelligent routing** — a lightweight 1.5B router model classifies user intent and picks the best model per request
 - **Cost & latency ranking** — models are ranked by live cost (DigitalOcean pricing API) or latency (Prometheus) before returning the fallback list
 - **Platform governance** — centralize API keys, rate limits, guardrails, and observability without touching app code
 - **Runs anywhere** — single binary; self-host the router for full data privacy
@ -30,38 +29,24 @@ routing_preferences:
    models:
      - openai/gpt-4o
      - openai/gpt-4o-mini
    selection_policy:
      prefer: cheapest        # rank by live cost data
  - name: code_generation
    description: generating new code, writing functions, or creating boilerplate
    models:
      - anthropic/claude-sonnet-4-20250514
      - openai/gpt-4o
    selection_policy:
      prefer: fastest         # rank by Prometheus p95 latency
 ```
 ### `selection_policy.prefer` values
 | Value | Behavior |
 |---|---|
 | `cheapest` | Sort models by ascending cost. Requires `cost_metrics` or `digitalocean_pricing` in `model_metrics_sources`. |
 | `fastest` | Sort models by ascending P95 latency. Requires `prometheus_metrics` in `model_metrics_sources`. |
 | `random` | Shuffle the model list on each request. |
 | `none` | Return models in definition order — no reordering. |
 When a request arrives, Plano:
 1. Sends the conversation + route descriptions to Arch-Router for intent classification
-2. Looks up the matched route and ranks its candidate models by cost or latency
+2. Looks up the matched route and returns its candidate models
 3. Returns an ordered list — client uses `models[0]`, falls back to `models[1]` on 429/5xx
 ```
 1. Request arrives          → "Write binary search in Python"
 2. Arch-Router classifies   → route: "code_generation"
-3. Rank by latency          → claude-sonnet (0.85s) < gpt-4o (1.2s)
+3. Response                 → models: ["anthropic/claude-sonnet-4-20250514", "openai/gpt-4o"]
 4. Response                 → models: ["anthropic/claude-sonnet-4-20250514", "openai/gpt-4o"]
 ```
 No match? Arch-Router returns `null` route → client falls back to the model in the original request.
@ -77,28 +62,12 @@ export OPENAI_API_KEY=<your-key>
 export ANTHROPIC_API_KEY=<your-key>
 ```
-Start Prometheus and the mock latency metrics server:
+Start Plano:
 ```bash
-cd demos/llm_routing/model_routing_service
+planoai up demos/llm_routing/model_routing_service/config.yaml
 docker compose up -d
 ```
 Then start Plano:
 ```bash
 planoai up config.yaml
 ```
 On startup you should see logs like:
 ```
 fetched digitalocean pricing: N models
 fetched prometheus latency metrics: 3 models
 ```
 If a model in `routing_preferences` has no matching pricing or latency data, Plano logs a warning at startup — the model is still included but ranked last.
 ## Run the demo
 ```bash
@ -135,59 +104,7 @@ Response:
 }
 ```
-The response contains the ranked model list — your client should try `models[0]` first and fall back to `models[1]` on 429 or 5xx errors.
+The response contains the model list — your client should try `models[0]` first and fall back to `models[1]` on 429 or 5xx errors.
 ## Metrics Sources
 ### DigitalOcean Pricing (`digitalocean_pricing`)
 Fetches public model pricing from the DigitalOcean Gen-AI catalog (no auth required). Model IDs are normalized as `lowercase(creator)/model_id`. Cost scalar = `input_price_per_million + output_price_per_million`.
 ```yaml
 model_metrics_sources:
  - type: digitalocean_pricing
    refresh_interval: 3600   # re-fetch every hour
 ```
 ### Prometheus Latency (`prometheus_metrics`)
 Queries a Prometheus instance for P95 latency. The PromQL expression must return an instant vector with a `model_name` label matching the model names in `routing_preferences`.
 ```yaml
 model_metrics_sources:
  - type: prometheus_metrics
    url: http://localhost:9090
    query: model_latency_p95_seconds
    refresh_interval: 60
 ```
 The demo's `metrics_server.py` exposes mock latency data; `docker compose up -d` starts it alongside Prometheus.
 ### Custom Cost Endpoint (`cost_metrics`)
 ```yaml
 model_metrics_sources:
  - type: cost_metrics
    url: https://my-internal-pricing-api/costs
    auth:
      type: bearer
      token: $PRICING_TOKEN
    refresh_interval: 300
 ```
 Expected response format:
 ```json
 {
  "anthropic/claude-sonnet-4-20250514": {
    "input_per_million": 3.0,
    "output_per_million": 15.0
  },
  "openai/gpt-4o": {
    "input_per_million": 5.0,
    "output_per_million": 20.0
  }
 }
 ```
 ## Kubernetes Deployment (Self-hosted Arch-Router on GPU)
--- a/demos/llm_routing/model_routing_service/config.yaml
+++ b/demos/llm_routing/model_routing_service/config.yaml
@ -22,32 +22,9 @@ routing_preferences:
    models:
      - openai/gpt-4o
      - openai/gpt-4o-mini
    selection_policy:
      prefer: cheapest
  - name: code_generation
    description: generating new code, writing functions, or creating boilerplate
    models:
      - anthropic/claude-sonnet-4-20250514
      - openai/gpt-4o
    selection_policy:
      prefer: fastest
 model_metrics_sources:
  - type: digitalocean_pricing
    refresh_interval: 3600
    model_aliases:
      openai-gpt-4o: openai/gpt-4o
      openai-gpt-4o-mini: openai/gpt-4o-mini
      anthropic-claude-sonnet-4: anthropic/claude-sonnet-4-20250514
  # Use cost_metrics instead of digitalocean_pricing to supply your own pricing data.
  # The demo metrics_server.py exposes /costs with OpenAI and Anthropic pricing.
  # - type: cost_metrics
  #   url: http://localhost:8080/costs
  #   refresh_interval: 300
  - type: prometheus_metrics
    url: http://localhost:9090
    query: model_latency_p95_seconds
    refresh_interval: 60
--- a/docs/routing-api.md
+++ b/docs/routing-api.md
@ -21,14 +21,12 @@ POST /v1/chat/completions
    {
      "name": "code generation",
      "description": "generating new code snippets",
-      "models": ["anthropic/claude-sonnet-4-20250514", "openai/gpt-4o", "openai/gpt-4o-mini"],
+      "models": ["anthropic/claude-sonnet-4-20250514", "openai/gpt-4o", "openai/gpt-4o-mini"]
      "selection_policy": {"prefer": "fastest"}
    },
    {
      "name": "general questions",
      "description": "casual conversation and simple queries",
-      "models": ["openai/gpt-4o-mini"],
+      "models": ["openai/gpt-4o-mini"]
      "selection_policy": {"prefer": "cheapest"}
    }
  ]
 }
@ -41,15 +39,6 @@ POST /v1/chat/completions
 | `name` | string | yes | Route identifier. Must match the LLM router's route classification. |
 | `description` | string | yes | Natural language description used by the router to match user intent. |
 | `models` | string[] | yes | Ordered candidate pool. At least one entry required. Must be declared in `model_providers`. |
 | `selection_policy.prefer` | enum | yes | How to rank models: `cheapest`, `fastest`, or `none`. |
 ### `selection_policy.prefer` values
 | Value | Behavior |
 |---|---|
 | `cheapest` | Sort by ascending cost from the metrics endpoint. Models with no data appended last. |
 | `fastest` | Sort by ascending latency from the metrics endpoint. Models with no data appended last. |
 | `none` | Return models in the order they were defined — no reordering. |
 ### Notes
@ -121,120 +110,14 @@ routing_preferences:
    models:
      - anthropic/claude-sonnet-4-20250514
      - openai/gpt-4o
    selection_policy:
      prefer: fastest
  - name: general questions
    description: casual conversation and simple queries
    models:
      - openai/gpt-4o-mini
      - openai/gpt-4o
    selection_policy:
      prefer: cheapest
 # Optional: live cost and latency data sources (max one per type)
 model_metrics_sources:
  # Option A: DigitalOcean public pricing (no auth required)
  - type: digitalocean_pricing
    refresh_interval: 3600
  # Option B: custom cost endpoint (mutually exclusive with digitalocean_pricing)
  # - type: cost_metrics
  #   url: https://internal-cost-api/models
  #   refresh_interval: 300  # seconds; omit for fetch-once on startup
  #   auth:
  #     type: bearer
  #     token: $COST_API_TOKEN
  - type: prometheus_metrics
    url: https://internal-prometheus/
    query: histogram_quantile(0.95, sum by (model_name, le) (rate(model_latency_seconds_bucket[5m])))
    refresh_interval: 60
 ```
 ### Startup validation
 Plano validates metric source configuration at startup and exits with a clear error if:
 | Condition | Error |
 |---|---|
 | `prefer: cheapest` with no cost source | `prefer: cheapest requires a cost data source — add cost_metrics or digitalocean_pricing` |
 | `prefer: fastest` with no `prometheus_metrics` | `prefer: fastest requires a prometheus_metrics source` |
 | Two `cost_metrics` entries | `only one cost_metrics source is allowed` |
 | Two `prometheus_metrics` entries | `only one prometheus_metrics source is allowed` |
 | Two `digitalocean_pricing` entries | `only one digitalocean_pricing source is allowed` |
 | `cost_metrics` and `digitalocean_pricing` both present | `cannot both be configured — use one or the other` |
 If a model listed in `routing_preferences` has no matching entry in the fetched pricing or latency data, Plano logs a `WARN` at startup — the model is still included but ranked last. The same warning is also emitted per routing request when a model has no data in cache at decision time (relevant for inline `routing_preferences` overrides that reference models not covered by the configured metrics sources).
 ### cost_metrics endpoint
 Plano GETs `url` on startup (and on each `refresh_interval`). Expected response — a JSON object mapping model name to an object with `input_per_million` and `output_per_million` fields:
 ```json
 {
  "anthropic/claude-sonnet-4-20250514": {
    "input_per_million": 3.0,
    "output_per_million": 15.0
  },
  "openai/gpt-4o": {
    "input_per_million": 5.0,
    "output_per_million": 20.0
  },
  "openai/gpt-4o-mini": {
    "input_per_million": 0.15,
    "output_per_million": 0.6
  }
 }
 ```
 - `auth.type: bearer` adds `Authorization: Bearer <token>` to the request
 - Plano combines the two fields as `input_per_million + output_per_million` to produce a single cost scalar used for ranking
 - Only relative order matters — the unit (e.g. USD per million tokens) is consistent so ranking is correct
 ### digitalocean_pricing source
 Fetches public model pricing from the DigitalOcean Gen-AI catalog. No authentication required.
 ```yaml
 model_metrics_sources:
  - type: digitalocean_pricing
    refresh_interval: 3600   # re-fetch every hour; omit to fetch once on startup
    model_aliases:
      openai-gpt-4o: openai/gpt-4o
      openai-gpt-4o-mini: openai/gpt-4o-mini
      anthropic-claude-sonnet-4: anthropic/claude-sonnet-4-20250514
 ```
 DO catalog entries are stored by their `model_id` field (e.g. `openai-gpt-4o`). The cost scalar is `input_price_per_million + output_price_per_million`.
 **`model_aliases`** — optional. Maps DO `model_id` values to the model names used in `routing_preferences`. Without aliases, cost data is stored under the DO model_id (e.g. `openai-gpt-4o`), which won't match models configured as `openai/gpt-4o`. Aliases let you bridge the naming gap without changing your routing config.
 **Constraints:**
 - `cost_metrics` and `digitalocean_pricing` cannot both be configured — use one or the other.
 - Only one `digitalocean_pricing` entry is allowed.
 ### prometheus_metrics endpoint
 Plano queries `{url}/api/v1/query?query={query}` on startup and each `refresh_interval`. The PromQL expression must return an instant vector with a `model_name` label:
 ```json
 {
  "status": "success",
  "data": {
    "resultType": "vector",
    "result": [
      {"metric": {"model_name": "anthropic/claude-sonnet-4-20250514"}, "value": [1234567890, "120.5"]},
      {"metric": {"model_name": "openai/gpt-4o"}, "value": [1234567890, "200.3"]}
    ]
  }
 }
 ```
 - The PromQL query is responsible for computing the percentile (e.g. `histogram_quantile(0.95, ...)`)
 - Latency units are arbitrary — only relative order matters
 - Models missing from the result are appended at the end of the ranked list
 ---
 ## Version Requirements