Merge branch 'main' into adil/release-0.4.15

2026-06-17 15:25:17 +02:00 · 2026-03-30 17:22:34 -07:00 · 2026-03-30 17:22:34 -07:00 · 21aa91551d
commit 21aa91551d
parent 0a2e5a1be3 af98c11a6d
7 changed files with 171 additions and 455 deletions
--- a/config/plano_config_schema.yaml
+++ b/config/plano_config_schema.yaml
@ -502,7 +502,6 @@ properties:
        - name
        - description
        - models
-        - selection_policy

  model_metrics_sources:
    type: array
@ -512,52 +511,11 @@ properties:
          properties:
            type:
              type: string
-              const: cost_metrics
-            url:
+              const: cost
+            provider:
              type: string
-            refresh_interval:
-              type: integer
-              minimum: 1
-            auth:
-              type: object
-              properties:
-                type:
-                  type: string
-                  enum:
-                    - bearer
-                token:
-                  type: string
-              required:
-                - type
-                - token
-              additionalProperties: false
-          required:
-            - type
-            - url
-          additionalProperties: false
-        - type: object
-          properties:
-            type:
-              type: string
-              const: prometheus_metrics
-            url:
-              type: string
-            query:
-              type: string
-            refresh_interval:
-              type: integer
-              minimum: 1
-              description: "Refresh interval in seconds"
-          required:
-            - type
-            - url
-            - query
-          additionalProperties: false
-        - type: object
-          properties:
-            type:
-              type: string
-              const: digitalocean_pricing
+              enum:
+                - digitalocean
            refresh_interval:
              type: integer
              minimum: 1
@ -569,6 +527,30 @@ properties:
                type: string
          required:
            - type
+            - provider
+          additionalProperties: false
+        - type: object
+          properties:
+            type:
+              type: string
+              const: latency
+            provider:
+              type: string
+              enum:
+                - prometheus
+            url:
+              type: string
+            query:
+              type: string
+            refresh_interval:
+              type: integer
+              minimum: 1
+              description: "Refresh interval in seconds"
+          required:
+            - type
+            - provider
+            - url
+            - query
          additionalProperties: false

 additionalProperties: false
--- a/crates/brightstaff/src/main.rs
+++ b/crates/brightstaff/src/main.rs
@ -216,33 +216,17 @@ async fn init_app_state(
        use common::configuration::MetricsSource;
        let cost_count = sources
            .iter()
-            .filter(|s| matches!(s, MetricsSource::CostMetrics { .. }))
+            .filter(|s| matches!(s, MetricsSource::Cost(_)))
            .count();
-        let prom_count = sources
+        let latency_count = sources
            .iter()
-            .filter(|s| matches!(s, MetricsSource::PrometheusMetrics { .. }))
-            .count();
-        let do_count = sources
-            .iter()
-            .filter(|s| matches!(s, MetricsSource::DigitalOceanPricing { .. }))
+            .filter(|s| matches!(s, MetricsSource::Latency(_)))
            .count();
        if cost_count > 1 {
-            return Err("model_metrics_sources: only one cost_metrics source is allowed".into());
+            return Err("model_metrics_sources: only one cost metrics source is allowed".into());
        }
-        if prom_count > 1 {
-            return Err(
-                "model_metrics_sources: only one prometheus_metrics source is allowed".into(),
-            );
-        }
-        if do_count > 1 {
-            return Err(
-                "model_metrics_sources: only one digitalocean_pricing source is allowed".into(),
-            );
-        }
-        if cost_count > 0 && do_count > 0 {
-            return Err(
-                "model_metrics_sources: cost_metrics and digitalocean_pricing cannot both be configured — use one or the other".into(),
-            );
+        if latency_count > 1 {
+            return Err("model_metrics_sources: only one latency metrics source is allowed".into());
        }
        let svc = ModelMetricsService::new(sources, reqwest::Client::new()).await;
        Some(Arc::new(svc))
@ -259,32 +243,27 @@ async fn init_app_state(
            .as_deref()
            .unwrap_or_default()
            .iter()
-            .any(|s| {
-                matches!(
-                    s,
-                    MetricsSource::CostMetrics { .. } | MetricsSource::DigitalOceanPricing { .. }
-                )
-            });
-        let has_prometheus = config
+            .any(|s| matches!(s, MetricsSource::Cost(_)));
+        let has_latency_source = config
            .model_metrics_sources
            .as_deref()
            .unwrap_or_default()
            .iter()
-            .any(|s| matches!(s, MetricsSource::PrometheusMetrics { .. }));
+            .any(|s| matches!(s, MetricsSource::Latency(_)));

        for pref in prefs {
            if pref.selection_policy.prefer == SelectionPreference::Cheapest && !has_cost_source {
                return Err(format!(
-                    "routing_preferences route '{}' uses prefer: cheapest but no cost data source is configured — \
-                     add cost_metrics or digitalocean_pricing to model_metrics_sources",
+                    "routing_preferences route '{}' uses prefer: cheapest but no cost metrics source is configured — \
+                     add a cost metrics source to model_metrics_sources",
                    pref.name
                )
                .into());
            }
-            if pref.selection_policy.prefer == SelectionPreference::Fastest && !has_prometheus {
+            if pref.selection_policy.prefer == SelectionPreference::Fastest && !has_latency_source {
                return Err(format!(
-                    "routing_preferences route '{}' uses prefer: fastest but no prometheus_metrics source is configured — \
-                     add prometheus_metrics to model_metrics_sources",
+                    "routing_preferences route '{}' uses prefer: fastest but no latency metrics source is configured — \
+                     add a latency metrics source to model_metrics_sources",
                    pref.name
                )
                .into());
--- a/crates/brightstaff/src/router/model_metrics.rs
+++ b/crates/brightstaff/src/router/model_metrics.rs
@ -2,9 +2,11 @@ use std::collections::HashMap;
 use std::sync::Arc;
 use std::time::Duration;

-use common::configuration::{MetricsSource, SelectionPolicy, SelectionPreference};
+use common::configuration::{
+    CostProvider, LatencyProvider, MetricsSource, SelectionPolicy, SelectionPreference,
+};
 use tokio::sync::RwLock;
-use tracing::{info, warn};
+use tracing::{debug, info, warn};

 const DO_PRICING_URL: &str = "https://api.digitalocean.com/v2/gen-ai/models/catalog";

@ -20,81 +22,52 @@ impl ModelMetricsService {

        for source in sources {
            match source {
-                MetricsSource::CostMetrics {
-                    url,
-                    refresh_interval,
-                    auth,
-                } => {
-                    let data = fetch_cost_metrics(url, auth.as_ref(), &client).await;
-                    info!(models = data.len(), url = %url, "fetched cost metrics");
-                    *cost_data.write().await = data;
+                MetricsSource::Cost(cfg) => match cfg.provider {
+                    CostProvider::Digitalocean => {
+                        let aliases = cfg.model_aliases.clone().unwrap_or_default();
+                        let data = fetch_do_pricing(&client, &aliases).await;
+                        info!(models = data.len(), "fetched digitalocean pricing");
+                        *cost_data.write().await = data;

-                    if let Some(interval_secs) = refresh_interval {
-                        let cost_clone = Arc::clone(&cost_data);
-                        let client_clone = client.clone();
-                        let url = url.clone();
-                        let auth = auth.clone();
-                        let interval = Duration::from_secs(*interval_secs);
-                        tokio::spawn(async move {
-                            loop {
-                                tokio::time::sleep(interval).await;
-                                let data =
-                                    fetch_cost_metrics(&url, auth.as_ref(), &client_clone).await;
-                                info!(models = data.len(), url = %url, "refreshed cost metrics");
-                                *cost_clone.write().await = data;
-                            }
-                        });
+                        if let Some(interval_secs) = cfg.refresh_interval {
+                            let cost_clone = Arc::clone(&cost_data);
+                            let client_clone = client.clone();
+                            let interval = Duration::from_secs(interval_secs);
+                            tokio::spawn(async move {
+                                loop {
+                                    tokio::time::sleep(interval).await;
+                                    let data = fetch_do_pricing(&client_clone, &aliases).await;
+                                    info!(models = data.len(), "refreshed digitalocean pricing");
+                                    *cost_clone.write().await = data;
+                                }
+                            });
+                        }
                    }
-                }
-                MetricsSource::PrometheusMetrics {
-                    url,
-                    query,
-                    refresh_interval,
-                } => {
-                    let data = fetch_prometheus_metrics(url, query, &client).await;
-                    info!(models = data.len(), url = %url, "fetched prometheus latency metrics");
-                    *latency_data.write().await = data;
+                },
+                MetricsSource::Latency(cfg) => match cfg.provider {
+                    LatencyProvider::Prometheus => {
+                        let data = fetch_prometheus_metrics(&cfg.url, &cfg.query, &client).await;
+                        info!(models = data.len(), url = %cfg.url, "fetched latency metrics");
+                        *latency_data.write().await = data;

-                    if let Some(interval_secs) = refresh_interval {
-                        let latency_clone = Arc::clone(&latency_data);
-                        let client_clone = client.clone();
-                        let url = url.clone();
-                        let query = query.clone();
-                        let interval = Duration::from_secs(*interval_secs);
-                        tokio::spawn(async move {
-                            loop {
-                                tokio::time::sleep(interval).await;
-                                let data =
-                                    fetch_prometheus_metrics(&url, &query, &client_clone).await;
-                                info!(models = data.len(), url = %url, "refreshed prometheus latency metrics");
-                                *latency_clone.write().await = data;
-                            }
-                        });
+                        if let Some(interval_secs) = cfg.refresh_interval {
+                            let latency_clone = Arc::clone(&latency_data);
+                            let client_clone = client.clone();
+                            let url = cfg.url.clone();
+                            let query = cfg.query.clone();
+                            let interval = Duration::from_secs(interval_secs);
+                            tokio::spawn(async move {
+                                loop {
+                                    tokio::time::sleep(interval).await;
+                                    let data =
+                                        fetch_prometheus_metrics(&url, &query, &client_clone).await;
+                                    info!(models = data.len(), url = %url, "refreshed latency metrics");
+                                    *latency_clone.write().await = data;
+                                }
+                            });
+                        }
                    }
-                }
-                MetricsSource::DigitalOceanPricing {
-                    refresh_interval,
-                    model_aliases,
-                } => {
-                    let aliases = model_aliases.clone().unwrap_or_default();
-                    let data = fetch_do_pricing(&client, &aliases).await;
-                    info!(models = data.len(), "fetched digitalocean pricing");
-                    *cost_data.write().await = data;
-
-                    if let Some(interval_secs) = refresh_interval {
-                        let cost_clone = Arc::clone(&cost_data);
-                        let client_clone = client.clone();
-                        let interval = Duration::from_secs(*interval_secs);
-                        tokio::spawn(async move {
-                            loop {
-                                tokio::time::sleep(interval).await;
-                                let data = fetch_do_pricing(&client_clone, &aliases).await;
-                                info!(models = data.len(), "refreshed digitalocean pricing");
-                                *cost_clone.write().await = data;
-                            }
-                        });
-                    }
-                }
+                },
            }
        }

@ -107,24 +80,32 @@ impl ModelMetricsService {
    /// Rank `models` by `policy`, returning them in preference order.
    /// Models with no metric data are appended at the end in their original order.
    pub async fn rank_models(&self, models: &[String], policy: &SelectionPolicy) -> Vec<String> {
+        let cost_data = self.cost.read().await;
+        let latency_data = self.latency.read().await;
+        debug!(
+            input_models = ?models,
+            cost_data = ?cost_data.iter().collect::<Vec<_>>(),
+            latency_data = ?latency_data.iter().collect::<Vec<_>>(),
+            prefer = ?policy.prefer,
+            "rank_models called"
+        );
+
        match policy.prefer {
            SelectionPreference::Cheapest => {
-                let data = self.cost.read().await;
                for m in models {
-                    if !data.contains_key(m.as_str()) {
+                    if !cost_data.contains_key(m.as_str()) {
                        warn!(model = %m, "no cost data for model — ranking last (prefer: cheapest)");
                    }
                }
-                rank_by_ascending_metric(models, &data)
+                rank_by_ascending_metric(models, &cost_data)
            }
            SelectionPreference::Fastest => {
-                let data = self.latency.read().await;
                for m in models {
-                    if !data.contains_key(m.as_str()) {
+                    if !latency_data.contains_key(m.as_str()) {
                        warn!(model = %m, "no latency data for model — ranking last (prefer: fastest)");
                    }
                }
-                rank_by_ascending_metric(models, &data)
+                rank_by_ascending_metric(models, &latency_data)
            }
            SelectionPreference::None => models.to_vec(),
        }
@ -144,13 +125,20 @@ impl ModelMetricsService {
 fn rank_by_ascending_metric(models: &[String], data: &HashMap<String, f64>) -> Vec<String> {
    let mut with_data: Vec<(&String, f64)> = models
        .iter()
-        .filter_map(|m| data.get(m.as_str()).map(|v| (m, *v)))
+        .filter_map(|m| {
+            let v = *data.get(m.as_str())?;
+            if v.is_nan() {
+                None
+            } else {
+                Some((m, v))
+            }
+        })
        .collect();
    with_data.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));

    let without_data: Vec<&String> = models
        .iter()
-        .filter(|m| !data.contains_key(m.as_str()))
+        .filter(|m| data.get(m.as_str()).is_none_or(|v| v.is_nan()))
        .collect();

    with_data
@ -160,43 +148,6 @@ fn rank_by_ascending_metric(models: &[String], data: &HashMap<String, f64>) -> V
        .collect()
 }

-#[derive(serde::Deserialize)]
-struct CostEntry {
-    input_per_million: f64,
-    output_per_million: f64,
-}
-
-async fn fetch_cost_metrics(
-    url: &str,
-    auth: Option<&common::configuration::MetricsAuth>,
-    client: &reqwest::Client,
-) -> HashMap<String, f64> {
-    let mut req = client.get(url);
-    if let Some(auth) = auth {
-        if auth.auth_type == "bearer" {
-            req = req.header("Authorization", format!("Bearer {}", auth.token));
-        } else {
-            warn!(auth_type = %auth.auth_type, "unsupported auth type for cost_metrics, skipping auth");
-        }
-    }
-    match req.send().await {
-        Ok(resp) => match resp.json::<HashMap<String, CostEntry>>().await {
-            Ok(data) => data
-                .into_iter()
-                .map(|(k, v)| (k, v.input_per_million + v.output_per_million))
-                .collect(),
-            Err(err) => {
-                warn!(error = %err, url = %url, "failed to parse cost metrics response");
-                HashMap::new()
-            }
-        },
-        Err(err) => {
-            warn!(error = %err, url = %url, "failed to fetch cost metrics");
-            HashMap::new()
-        }
-    }
-}
-
 #[derive(serde::Deserialize)]
 struct DoModelList {
    data: Vec<DoModel>,
@ -416,4 +367,22 @@ mod tests {
        // none → original order, despite gpt-4o-mini being cheaper
        assert_eq!(result, vec!["gpt-4o", "gpt-4o-mini"]);
    }
+
+    #[test]
+    fn test_rank_by_ascending_metric_nan_treated_as_missing() {
+        let models = vec![
+            "a".to_string(),
+            "b".to_string(),
+            "c".to_string(),
+            "d".to_string(),
+        ];
+        let mut data = HashMap::new();
+        data.insert("a".to_string(), f64::NAN);
+        data.insert("b".to_string(), 0.5);
+        data.insert("c".to_string(), 0.1);
+        // "d" has no entry at all
+        let result = rank_by_ascending_metric(&models, &data);
+        // c (0.1) < b (0.5), then NaN "a" and missing "d" appended in original order
+        assert_eq!(result, vec!["c", "b", "a", "d"]);
+    }
 }
--- a/crates/common/src/configuration.rs
+++ b/crates/common/src/configuration.rs
@ -104,16 +104,17 @@ pub enum StateStorageType {
    Postgres,
 }

-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
 #[serde(rename_all = "lowercase")]
 pub enum SelectionPreference {
    Cheapest,
    Fastest,
    /// Return models in the same order they were defined — no reordering.
+    #[default]
    None,
 }

-#[derive(Debug, Clone, Serialize, Deserialize)]
+#[derive(Debug, Clone, Default, Serialize, Deserialize)]
 pub struct SelectionPolicy {
    pub prefer: SelectionPreference,
 }
@ -123,36 +124,44 @@ pub struct TopLevelRoutingPreference {
    pub name: String,
    pub description: String,
    pub models: Vec<String>,
+    #[serde(default)]
    pub selection_policy: SelectionPolicy,
 }

-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct MetricsAuth {
-    #[serde(rename = "type")]
-    pub auth_type: String, // only "bearer" supported
-    pub token: String,
-}
-
 #[derive(Debug, Clone, Serialize, Deserialize)]
 #[serde(tag = "type", rename_all = "snake_case")]
 pub enum MetricsSource {
-    CostMetrics {
-        url: String,
-        refresh_interval: Option<u64>,
-        auth: Option<MetricsAuth>,
-    },
-    PrometheusMetrics {
-        url: String,
-        query: String,
-        refresh_interval: Option<u64>,
-    },
-    #[serde(rename = "digitalocean_pricing")]
-    DigitalOceanPricing {
-        refresh_interval: Option<u64>,
-        /// Map DO catalog keys (`lowercase(creator)/model_id`) to Plano model names.
-        /// Example: `openai/openai-gpt-oss-120b: openai/gpt-4o`
-        model_aliases: Option<HashMap<String, String>>,
-    },
+    Cost(CostMetricsConfig),
+    Latency(LatencyMetricsConfig),
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct CostMetricsConfig {
+    pub provider: CostProvider,
+    pub refresh_interval: Option<u64>,
+    /// Map DO catalog keys (`lowercase(creator)/model_id`) to Plano model names.
+    /// Example: `openai/openai-gpt-oss-120b: openai/gpt-4o`
+    pub model_aliases: Option<HashMap<String, String>>,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum CostProvider {
+    Digitalocean,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct LatencyMetricsConfig {
+    pub provider: LatencyProvider,
+    pub url: String,
+    pub query: String,
+    pub refresh_interval: Option<u64>,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum LatencyProvider {
+    Prometheus,
 }

 #[derive(Debug, Clone, Serialize, Deserialize)]
--- a/demos/llm_routing/model_routing_service/README.md
+++ b/demos/llm_routing/model_routing_service/README.md
@ -13,7 +13,6 @@ Plano is an AI-native proxy and data plane for agentic apps — with built-in or

 - **One endpoint, many models** — apps call Plano using standard OpenAI/Anthropic APIs; Plano handles provider selection, keys, and failover
 - **Intelligent routing** — a lightweight 1.5B router model classifies user intent and picks the best model per request
- **Cost & latency ranking** — models are ranked by live cost (DigitalOcean pricing API) or latency (Prometheus) before returning the fallback list
 - **Platform governance** — centralize API keys, rate limits, guardrails, and observability without touching app code
 - **Runs anywhere** — single binary; self-host the router for full data privacy

@ -30,38 +29,24 @@ routing_preferences:
    models:
      - openai/gpt-4o
      - openai/gpt-4o-mini
-    selection_policy:
-      prefer: cheapest        # rank by live cost data

  - name: code_generation
    description: generating new code, writing functions, or creating boilerplate
    models:
      - anthropic/claude-sonnet-4-20250514
      - openai/gpt-4o
-    selection_policy:
-      prefer: fastest         # rank by Prometheus p95 latency
 ```

-### `selection_policy.prefer` values
-
-| Value | Behavior |
-|---|---|
-| `cheapest` | Sort models by ascending cost. Requires `cost_metrics` or `digitalocean_pricing` in `model_metrics_sources`. |
-| `fastest` | Sort models by ascending P95 latency. Requires `prometheus_metrics` in `model_metrics_sources`. |
-| `random` | Shuffle the model list on each request. |
-| `none` | Return models in definition order — no reordering. |
-
 When a request arrives, Plano:

 1. Sends the conversation + route descriptions to Arch-Router for intent classification
-2. Looks up the matched route and ranks its candidate models by cost or latency
+2. Looks up the matched route and returns its candidate models
 3. Returns an ordered list — client uses `models[0]`, falls back to `models[1]` on 429/5xx

 ```
 1. Request arrives          → "Write binary search in Python"
 2. Arch-Router classifies   → route: "code_generation"
-3. Rank by latency          → claude-sonnet (0.85s) < gpt-4o (1.2s)
-4. Response                 → models: ["anthropic/claude-sonnet-4-20250514", "openai/gpt-4o"]
+3. Response                 → models: ["anthropic/claude-sonnet-4-20250514", "openai/gpt-4o"]
 ```

 No match? Arch-Router returns `null` route → client falls back to the model in the original request.
@ -77,28 +62,12 @@ export OPENAI_API_KEY=<your-key>
 export ANTHROPIC_API_KEY=<your-key>
 ```

-Start Prometheus and the mock latency metrics server:
+Start Plano:

 ```bash
-cd demos/llm_routing/model_routing_service
-docker compose up -d
+planoai up demos/llm_routing/model_routing_service/config.yaml
 ```

-Then start Plano:
-
-```bash
-planoai up config.yaml
-```
-
-On startup you should see logs like:
-
-```
-fetched digitalocean pricing: N models
-fetched prometheus latency metrics: 3 models
-```
-
-If a model in `routing_preferences` has no matching pricing or latency data, Plano logs a warning at startup — the model is still included but ranked last.
-
 ## Run the demo

 ```bash
@ -135,59 +104,7 @@ Response:
 }
 ```

-The response contains the ranked model list — your client should try `models[0]` first and fall back to `models[1]` on 429 or 5xx errors.
-
-## Metrics Sources
-
-### DigitalOcean Pricing (`digitalocean_pricing`)
-
-Fetches public model pricing from the DigitalOcean Gen-AI catalog (no auth required). Model IDs are normalized as `lowercase(creator)/model_id`. Cost scalar = `input_price_per_million + output_price_per_million`.
-
-```yaml
-model_metrics_sources:
-  - type: digitalocean_pricing
-    refresh_interval: 3600   # re-fetch every hour
-```
-
-### Prometheus Latency (`prometheus_metrics`)
-
-Queries a Prometheus instance for P95 latency. The PromQL expression must return an instant vector with a `model_name` label matching the model names in `routing_preferences`.
-
-```yaml
-model_metrics_sources:
-  - type: prometheus_metrics
-    url: http://localhost:9090
-    query: model_latency_p95_seconds
-    refresh_interval: 60
-```
-
-The demo's `metrics_server.py` exposes mock latency data; `docker compose up -d` starts it alongside Prometheus.
-
-### Custom Cost Endpoint (`cost_metrics`)
-
-```yaml
-model_metrics_sources:
-  - type: cost_metrics
-    url: https://my-internal-pricing-api/costs
-    auth:
-      type: bearer
-      token: $PRICING_TOKEN
-    refresh_interval: 300
-```
-
-Expected response format:
-```json
-{
-  "anthropic/claude-sonnet-4-20250514": {
-    "input_per_million": 3.0,
-    "output_per_million": 15.0
-  },
-  "openai/gpt-4o": {
-    "input_per_million": 5.0,
-    "output_per_million": 20.0
-  }
-}
-```
+The response contains the model list — your client should try `models[0]` first and fall back to `models[1]` on 429 or 5xx errors.

 ## Kubernetes Deployment (Self-hosted Arch-Router on GPU)

--- a/demos/llm_routing/model_routing_service/config.yaml
+++ b/demos/llm_routing/model_routing_service/config.yaml
@ -22,32 +22,9 @@ routing_preferences:
    models:
      - openai/gpt-4o
      - openai/gpt-4o-mini
-    selection_policy:
-      prefer: cheapest

  - name: code_generation
    description: generating new code, writing functions, or creating boilerplate
    models:
      - anthropic/claude-sonnet-4-20250514
      - openai/gpt-4o
-    selection_policy:
-      prefer: fastest
-
-model_metrics_sources:
-  - type: digitalocean_pricing
-    refresh_interval: 3600
-    model_aliases:
-      openai-gpt-4o: openai/gpt-4o
-      openai-gpt-4o-mini: openai/gpt-4o-mini
-      anthropic-claude-sonnet-4: anthropic/claude-sonnet-4-20250514
-
-  # Use cost_metrics instead of digitalocean_pricing to supply your own pricing data.
-  # The demo metrics_server.py exposes /costs with OpenAI and Anthropic pricing.
-  # - type: cost_metrics
-  #   url: http://localhost:8080/costs
-  #   refresh_interval: 300
-
-  - type: prometheus_metrics
-    url: http://localhost:9090
-    query: model_latency_p95_seconds
-    refresh_interval: 60
--- a/docs/routing-api.md
+++ b/docs/routing-api.md
@ -21,14 +21,12 @@ POST /v1/chat/completions
    {
      "name": "code generation",
      "description": "generating new code snippets",
-      "models": ["anthropic/claude-sonnet-4-20250514", "openai/gpt-4o", "openai/gpt-4o-mini"],
-      "selection_policy": {"prefer": "fastest"}
+      "models": ["anthropic/claude-sonnet-4-20250514", "openai/gpt-4o", "openai/gpt-4o-mini"]
    },
    {
      "name": "general questions",
      "description": "casual conversation and simple queries",
-      "models": ["openai/gpt-4o-mini"],
-      "selection_policy": {"prefer": "cheapest"}
+      "models": ["openai/gpt-4o-mini"]
    }
  ]
 }
@ -41,15 +39,6 @@ POST /v1/chat/completions
 | `name` | string | yes | Route identifier. Must match the LLM router's route classification. |
 | `description` | string | yes | Natural language description used by the router to match user intent. |
 | `models` | string[] | yes | Ordered candidate pool. At least one entry required. Must be declared in `model_providers`. |
-| `selection_policy.prefer` | enum | yes | How to rank models: `cheapest`, `fastest`, or `none`. |
-
-### `selection_policy.prefer` values
-
-| Value | Behavior |
-|---|---|
-| `cheapest` | Sort by ascending cost from the metrics endpoint. Models with no data appended last. |
-| `fastest` | Sort by ascending latency from the metrics endpoint. Models with no data appended last. |
-| `none` | Return models in the order they were defined — no reordering. |

 ### Notes

@ -121,120 +110,14 @@ routing_preferences:
    models:
      - anthropic/claude-sonnet-4-20250514
      - openai/gpt-4o
-    selection_policy:
-      prefer: fastest

  - name: general questions
    description: casual conversation and simple queries
    models:
      - openai/gpt-4o-mini
      - openai/gpt-4o
-    selection_policy:
-      prefer: cheapest
-
-# Optional: live cost and latency data sources (max one per type)
-model_metrics_sources:
-  # Option A: DigitalOcean public pricing (no auth required)
-  - type: digitalocean_pricing
-    refresh_interval: 3600
-
-  # Option B: custom cost endpoint (mutually exclusive with digitalocean_pricing)
-  # - type: cost_metrics
-  #   url: https://internal-cost-api/models
-  #   refresh_interval: 300  # seconds; omit for fetch-once on startup
-  #   auth:
-  #     type: bearer
-  #     token: $COST_API_TOKEN
-
-  - type: prometheus_metrics
-    url: https://internal-prometheus/
-    query: histogram_quantile(0.95, sum by (model_name, le) (rate(model_latency_seconds_bucket[5m])))
-    refresh_interval: 60
 ```

-### Startup validation
-
-Plano validates metric source configuration at startup and exits with a clear error if:
-
-| Condition | Error |
-|---|---|
-| `prefer: cheapest` with no cost source | `prefer: cheapest requires a cost data source — add cost_metrics or digitalocean_pricing` |
-| `prefer: fastest` with no `prometheus_metrics` | `prefer: fastest requires a prometheus_metrics source` |
-| Two `cost_metrics` entries | `only one cost_metrics source is allowed` |
-| Two `prometheus_metrics` entries | `only one prometheus_metrics source is allowed` |
-| Two `digitalocean_pricing` entries | `only one digitalocean_pricing source is allowed` |
-| `cost_metrics` and `digitalocean_pricing` both present | `cannot both be configured — use one or the other` |
-
-If a model listed in `routing_preferences` has no matching entry in the fetched pricing or latency data, Plano logs a `WARN` at startup — the model is still included but ranked last. The same warning is also emitted per routing request when a model has no data in cache at decision time (relevant for inline `routing_preferences` overrides that reference models not covered by the configured metrics sources).
-
-### cost_metrics endpoint
-
-Plano GETs `url` on startup (and on each `refresh_interval`). Expected response — a JSON object mapping model name to an object with `input_per_million` and `output_per_million` fields:
-
-```json
-{
-  "anthropic/claude-sonnet-4-20250514": {
-    "input_per_million": 3.0,
-    "output_per_million": 15.0
-  },
-  "openai/gpt-4o": {
-    "input_per_million": 5.0,
-    "output_per_million": 20.0
-  },
-  "openai/gpt-4o-mini": {
-    "input_per_million": 0.15,
-    "output_per_million": 0.6
-  }
-}
-```
-
- `auth.type: bearer` adds `Authorization: Bearer <token>` to the request
- Plano combines the two fields as `input_per_million + output_per_million` to produce a single cost scalar used for ranking
- Only relative order matters — the unit (e.g. USD per million tokens) is consistent so ranking is correct
-
-### digitalocean_pricing source
-
-Fetches public model pricing from the DigitalOcean Gen-AI catalog. No authentication required.
-
-```yaml
-model_metrics_sources:
-  - type: digitalocean_pricing
-    refresh_interval: 3600   # re-fetch every hour; omit to fetch once on startup
-    model_aliases:
-      openai-gpt-4o: openai/gpt-4o
-      openai-gpt-4o-mini: openai/gpt-4o-mini
-      anthropic-claude-sonnet-4: anthropic/claude-sonnet-4-20250514
-```
-
-DO catalog entries are stored by their `model_id` field (e.g. `openai-gpt-4o`). The cost scalar is `input_price_per_million + output_price_per_million`.
-
-**`model_aliases`** — optional. Maps DO `model_id` values to the model names used in `routing_preferences`. Without aliases, cost data is stored under the DO model_id (e.g. `openai-gpt-4o`), which won't match models configured as `openai/gpt-4o`. Aliases let you bridge the naming gap without changing your routing config.
-
-**Constraints:**
- `cost_metrics` and `digitalocean_pricing` cannot both be configured — use one or the other.
- Only one `digitalocean_pricing` entry is allowed.
-
-### prometheus_metrics endpoint
-
-Plano queries `{url}/api/v1/query?query={query}` on startup and each `refresh_interval`. The PromQL expression must return an instant vector with a `model_name` label:
-
-```json
-{
-  "status": "success",
-  "data": {
-    "resultType": "vector",
-    "result": [
-      {"metric": {"model_name": "anthropic/claude-sonnet-4-20250514"}, "value": [1234567890, "120.5"]},
-      {"metric": {"model_name": "openai/gpt-4o"}, "value": [1234567890, "200.3"]}
-    ]
-  }
-}
-```
-
- The PromQL query is responsible for computing the percentile (e.g. `histogram_quantile(0.95, ...)`)
- Latency units are arbitrary — only relative order matters
- Models missing from the result are appended at the end of the ranked list
-
 ---

 ## Version Requirements