restructure model_metrics_sources to use type + provider pattern

2026-07-23 16:51:04 +02:00 · 2026-03-30 15:18:04 -07:00 · 2026-03-30 15:18:04 -07:00 · ba701264be
commit ba701264be
parent e5751d6b13
7 changed files with 142 additions and 299 deletions
--- a/config/plano_config_schema.yaml
+++ b/config/plano_config_schema.yaml
@ -512,52 +512,11 @@ properties:
          properties:
            type:
              type: string
-              const: cost_metrics
-            url:
+              const: cost
+            provider:
              type: string
-            refresh_interval:
-              type: integer
-              minimum: 1
-            auth:
-              type: object
-              properties:
-                type:
-                  type: string
-                  enum:
-                    - bearer
-                token:
-                  type: string
-              required:
-                - type
-                - token
-              additionalProperties: false
-          required:
-            - type
-            - url
-          additionalProperties: false
-        - type: object
-          properties:
-            type:
-              type: string
-              const: prometheus_metrics
-            url:
-              type: string
-            query:
-              type: string
-            refresh_interval:
-              type: integer
-              minimum: 1
-              description: "Refresh interval in seconds"
-          required:
-            - type
-            - url
-            - query
-          additionalProperties: false
-        - type: object
-          properties:
-            type:
-              type: string
-              const: digitalocean_pricing
+              enum:
+                - digitalocean
            refresh_interval:
              type: integer
              minimum: 1
@ -569,6 +528,30 @@ properties:
                type: string
          required:
            - type
+            - provider
+          additionalProperties: false
+        - type: object
+          properties:
+            type:
+              type: string
+              const: latency
+            provider:
+              type: string
+              enum:
+                - prometheus
+            url:
+              type: string
+            query:
+              type: string
+            refresh_interval:
+              type: integer
+              minimum: 1
+              description: "Refresh interval in seconds"
+          required:
+            - type
+            - provider
+            - url
+            - query
          additionalProperties: false

 additionalProperties: false
--- a/crates/brightstaff/src/main.rs
+++ b/crates/brightstaff/src/main.rs
@ -216,33 +216,17 @@ async fn init_app_state(
        use common::configuration::MetricsSource;
        let cost_count = sources
            .iter()
-            .filter(|s| matches!(s, MetricsSource::CostMetrics { .. }))
+            .filter(|s| matches!(s, MetricsSource::Cost(_)))
            .count();
-        let prom_count = sources
+        let latency_count = sources
            .iter()
-            .filter(|s| matches!(s, MetricsSource::PrometheusMetrics { .. }))
-            .count();
-        let do_count = sources
-            .iter()
-            .filter(|s| matches!(s, MetricsSource::DigitalOceanPricing { .. }))
+            .filter(|s| matches!(s, MetricsSource::Latency(_)))
            .count();
        if cost_count > 1 {
-            return Err("model_metrics_sources: only one cost_metrics source is allowed".into());
+            return Err("model_metrics_sources: only one cost metrics source is allowed".into());
        }
-        if prom_count > 1 {
-            return Err(
-                "model_metrics_sources: only one prometheus_metrics source is allowed".into(),
-            );
-        }
-        if do_count > 1 {
-            return Err(
-                "model_metrics_sources: only one digitalocean_pricing source is allowed".into(),
-            );
-        }
-        if cost_count > 0 && do_count > 0 {
-            return Err(
-                "model_metrics_sources: cost_metrics and digitalocean_pricing cannot both be configured — use one or the other".into(),
-            );
+        if latency_count > 1 {
+            return Err("model_metrics_sources: only one latency metrics source is allowed".into());
        }
        let svc = ModelMetricsService::new(sources, reqwest::Client::new()).await;
        Some(Arc::new(svc))
@ -259,32 +243,27 @@ async fn init_app_state(
            .as_deref()
            .unwrap_or_default()
            .iter()
-            .any(|s| {
-                matches!(
-                    s,
-                    MetricsSource::CostMetrics { .. } | MetricsSource::DigitalOceanPricing { .. }
-                )
-            });
-        let has_prometheus = config
+            .any(|s| matches!(s, MetricsSource::Cost(_)));
+        let has_latency_source = config
            .model_metrics_sources
            .as_deref()
            .unwrap_or_default()
            .iter()
-            .any(|s| matches!(s, MetricsSource::PrometheusMetrics { .. }));
+            .any(|s| matches!(s, MetricsSource::Latency(_)));

        for pref in prefs {
            if pref.selection_policy.prefer == SelectionPreference::Cheapest && !has_cost_source {
                return Err(format!(
-                    "routing_preferences route '{}' uses prefer: cheapest but no cost data source is configured — \
-                     add cost_metrics or digitalocean_pricing to model_metrics_sources",
+                    "routing_preferences route '{}' uses prefer: cheapest but no cost metrics source is configured — \
+                     add a cost metrics source to model_metrics_sources",
                    pref.name
                )
                .into());
            }
-            if pref.selection_policy.prefer == SelectionPreference::Fastest && !has_prometheus {
+            if pref.selection_policy.prefer == SelectionPreference::Fastest && !has_latency_source {
                return Err(format!(
-                    "routing_preferences route '{}' uses prefer: fastest but no prometheus_metrics source is configured — \
-                     add prometheus_metrics to model_metrics_sources",
+                    "routing_preferences route '{}' uses prefer: fastest but no latency metrics source is configured — \
+                     add a latency metrics source to model_metrics_sources",
                    pref.name
                )
                .into());
--- a/crates/brightstaff/src/router/model_metrics.rs
+++ b/crates/brightstaff/src/router/model_metrics.rs
@ -2,7 +2,9 @@ use std::collections::HashMap;
 use std::sync::Arc;
 use std::time::Duration;

-use common::configuration::{MetricsSource, SelectionPolicy, SelectionPreference};
+use common::configuration::{
+    CostProvider, LatencyProvider, MetricsSource, SelectionPolicy, SelectionPreference,
+};
 use tokio::sync::RwLock;
 use tracing::{info, warn};

@ -20,81 +22,52 @@ impl ModelMetricsService {

        for source in sources {
            match source {
-                MetricsSource::CostMetrics {
-                    url,
-                    refresh_interval,
-                    auth,
-                } => {
-                    let data = fetch_cost_metrics(url, auth.as_ref(), &client).await;
-                    info!(models = data.len(), url = %url, "fetched cost metrics");
-                    *cost_data.write().await = data;
+                MetricsSource::Cost(cfg) => match cfg.provider {
+                    CostProvider::Digitalocean => {
+                        let aliases = cfg.model_aliases.clone().unwrap_or_default();
+                        let data = fetch_do_pricing(&client, &aliases).await;
+                        info!(models = data.len(), "fetched digitalocean pricing");
+                        *cost_data.write().await = data;

-                    if let Some(interval_secs) = refresh_interval {
-                        let cost_clone = Arc::clone(&cost_data);
-                        let client_clone = client.clone();
-                        let url = url.clone();
-                        let auth = auth.clone();
-                        let interval = Duration::from_secs(*interval_secs);
-                        tokio::spawn(async move {
-                            loop {
-                                tokio::time::sleep(interval).await;
-                                let data =
-                                    fetch_cost_metrics(&url, auth.as_ref(), &client_clone).await;
-                                info!(models = data.len(), url = %url, "refreshed cost metrics");
-                                *cost_clone.write().await = data;
-                            }
-                        });
+                        if let Some(interval_secs) = cfg.refresh_interval {
+                            let cost_clone = Arc::clone(&cost_data);
+                            let client_clone = client.clone();
+                            let interval = Duration::from_secs(interval_secs);
+                            tokio::spawn(async move {
+                                loop {
+                                    tokio::time::sleep(interval).await;
+                                    let data = fetch_do_pricing(&client_clone, &aliases).await;
+                                    info!(models = data.len(), "refreshed digitalocean pricing");
+                                    *cost_clone.write().await = data;
+                                }
+                            });
+                        }
                    }
-                }
-                MetricsSource::PrometheusMetrics {
-                    url,
-                    query,
-                    refresh_interval,
-                } => {
-                    let data = fetch_prometheus_metrics(url, query, &client).await;
-                    info!(models = data.len(), url = %url, "fetched prometheus latency metrics");
-                    *latency_data.write().await = data;
+                },
+                MetricsSource::Latency(cfg) => match cfg.provider {
+                    LatencyProvider::Prometheus => {
+                        let data = fetch_prometheus_metrics(&cfg.url, &cfg.query, &client).await;
+                        info!(models = data.len(), url = %cfg.url, "fetched latency metrics");
+                        *latency_data.write().await = data;

-                    if let Some(interval_secs) = refresh_interval {
-                        let latency_clone = Arc::clone(&latency_data);
-                        let client_clone = client.clone();
-                        let url = url.clone();
-                        let query = query.clone();
-                        let interval = Duration::from_secs(*interval_secs);
-                        tokio::spawn(async move {
-                            loop {
-                                tokio::time::sleep(interval).await;
-                                let data =
-                                    fetch_prometheus_metrics(&url, &query, &client_clone).await;
-                                info!(models = data.len(), url = %url, "refreshed prometheus latency metrics");
-                                *latency_clone.write().await = data;
-                            }
-                        });
+                        if let Some(interval_secs) = cfg.refresh_interval {
+                            let latency_clone = Arc::clone(&latency_data);
+                            let client_clone = client.clone();
+                            let url = cfg.url.clone();
+                            let query = cfg.query.clone();
+                            let interval = Duration::from_secs(interval_secs);
+                            tokio::spawn(async move {
+                                loop {
+                                    tokio::time::sleep(interval).await;
+                                    let data =
+                                        fetch_prometheus_metrics(&url, &query, &client_clone).await;
+                                    info!(models = data.len(), url = %url, "refreshed latency metrics");
+                                    *latency_clone.write().await = data;
+                                }
+                            });
+                        }
                    }
-                }
-                MetricsSource::DigitalOceanPricing {
-                    refresh_interval,
-                    model_aliases,
-                } => {
-                    let aliases = model_aliases.clone().unwrap_or_default();
-                    let data = fetch_do_pricing(&client, &aliases).await;
-                    info!(models = data.len(), "fetched digitalocean pricing");
-                    *cost_data.write().await = data;
-
-                    if let Some(interval_secs) = refresh_interval {
-                        let cost_clone = Arc::clone(&cost_data);
-                        let client_clone = client.clone();
-                        let interval = Duration::from_secs(*interval_secs);
-                        tokio::spawn(async move {
-                            loop {
-                                tokio::time::sleep(interval).await;
-                                let data = fetch_do_pricing(&client_clone, &aliases).await;
-                                info!(models = data.len(), "refreshed digitalocean pricing");
-                                *cost_clone.write().await = data;
-                            }
-                        });
-                    }
-                }
+                },
            }
        }

@ -160,43 +133,6 @@ fn rank_by_ascending_metric(models: &[String], data: &HashMap<String, f64>) -> V
        .collect()
 }

-#[derive(serde::Deserialize)]
-struct CostEntry {
-    input_per_million: f64,
-    output_per_million: f64,
-}
-
-async fn fetch_cost_metrics(
-    url: &str,
-    auth: Option<&common::configuration::MetricsAuth>,
-    client: &reqwest::Client,
-) -> HashMap<String, f64> {
-    let mut req = client.get(url);
-    if let Some(auth) = auth {
-        if auth.auth_type == "bearer" {
-            req = req.header("Authorization", format!("Bearer {}", auth.token));
-        } else {
-            warn!(auth_type = %auth.auth_type, "unsupported auth type for cost_metrics, skipping auth");
-        }
-    }
-    match req.send().await {
-        Ok(resp) => match resp.json::<HashMap<String, CostEntry>>().await {
-            Ok(data) => data
-                .into_iter()
-                .map(|(k, v)| (k, v.input_per_million + v.output_per_million))
-                .collect(),
-            Err(err) => {
-                warn!(error = %err, url = %url, "failed to parse cost metrics response");
-                HashMap::new()
-            }
-        },
-        Err(err) => {
-            warn!(error = %err, url = %url, "failed to fetch cost metrics");
-            HashMap::new()
-        }
-    }
-}
-
 #[derive(serde::Deserialize)]
 struct DoModelList {
    data: Vec<DoModel>,
--- a/crates/common/src/configuration.rs
+++ b/crates/common/src/configuration.rs
@ -127,32 +127,39 @@ pub struct TopLevelRoutingPreference {
 }

 #[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct MetricsAuth {
-    #[serde(rename = "type")]
-    pub auth_type: String, // only "bearer" supported
-    pub token: String,
+#[serde(tag = "type", rename_all = "snake_case")]
+pub enum MetricsSource {
+    Cost(CostMetricsConfig),
+    Latency(LatencyMetricsConfig),
 }

 #[derive(Debug, Clone, Serialize, Deserialize)]
-#[serde(tag = "type", rename_all = "snake_case")]
-pub enum MetricsSource {
-    CostMetrics {
-        url: String,
-        refresh_interval: Option<u64>,
-        auth: Option<MetricsAuth>,
-    },
-    PrometheusMetrics {
-        url: String,
-        query: String,
-        refresh_interval: Option<u64>,
-    },
-    #[serde(rename = "digitalocean_pricing")]
-    DigitalOceanPricing {
-        refresh_interval: Option<u64>,
-        /// Map DO catalog keys (`lowercase(creator)/model_id`) to Plano model names.
-        /// Example: `openai/openai-gpt-oss-120b: openai/gpt-4o`
-        model_aliases: Option<HashMap<String, String>>,
-    },
+pub struct CostMetricsConfig {
+    pub provider: CostProvider,
+    pub refresh_interval: Option<u64>,
+    /// Map DO catalog keys (`lowercase(creator)/model_id`) to Plano model names.
+    /// Example: `openai/openai-gpt-oss-120b: openai/gpt-4o`
+    pub model_aliases: Option<HashMap<String, String>>,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum CostProvider {
+    Digitalocean,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct LatencyMetricsConfig {
+    pub provider: LatencyProvider,
+    pub url: String,
+    pub query: String,
+    pub refresh_interval: Option<u64>,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum LatencyProvider {
+    Prometheus,
 }

 #[derive(Debug, Clone, Serialize, Deserialize)]
--- a/demos/llm_routing/model_routing_service/README.md
+++ b/demos/llm_routing/model_routing_service/README.md
@ -46,8 +46,8 @@ routing_preferences:

 | Value | Behavior |
 |---|---|
-| `cheapest` | Sort models by ascending cost. Requires `cost_metrics` or `digitalocean_pricing` in `model_metrics_sources`. |
-| `fastest` | Sort models by ascending P95 latency. Requires `prometheus_metrics` in `model_metrics_sources`. |
+| `cheapest` | Sort models by ascending cost. Requires a `type: cost` source in `model_metrics_sources`. |
+| `fastest` | Sort models by ascending P95 latency. Requires a `type: latency` source in `model_metrics_sources`. |
 | `random` | Shuffle the model list on each request. |
 | `none` | Return models in definition order — no reordering. |

@ -139,23 +139,25 @@ The response contains the ranked model list — your client should try `models[0

 ## Metrics Sources

-### DigitalOcean Pricing (`digitalocean_pricing`)
+### Cost Metrics (provider: digitalocean)

 Fetches public model pricing from the DigitalOcean Gen-AI catalog (no auth required). Model IDs are normalized as `lowercase(creator)/model_id`. Cost scalar = `input_price_per_million + output_price_per_million`.

 ```yaml
 model_metrics_sources:
-  - type: digitalocean_pricing
+  - type: cost
+    provider: digitalocean
    refresh_interval: 3600   # re-fetch every hour
 ```

-### Prometheus Latency (`prometheus_metrics`)
+### Latency Metrics (provider: prometheus)

 Queries a Prometheus instance for P95 latency. The PromQL expression must return an instant vector with a `model_name` label matching the model names in `routing_preferences`.

 ```yaml
 model_metrics_sources:
-  - type: prometheus_metrics
+  - type: latency
+    provider: prometheus
    url: http://localhost:9090
    query: model_latency_p95_seconds
    refresh_interval: 60
@ -163,32 +165,6 @@ model_metrics_sources:

 The demo's `metrics_server.py` exposes mock latency data; `docker compose up -d` starts it alongside Prometheus.

-### Custom Cost Endpoint (`cost_metrics`)
-
-```yaml
-model_metrics_sources:
-  - type: cost_metrics
-    url: https://my-internal-pricing-api/costs
-    auth:
-      type: bearer
-      token: $PRICING_TOKEN
-    refresh_interval: 300
-```
-
-Expected response format:
-```json
-{
-  "anthropic/claude-sonnet-4-20250514": {
-    "input_per_million": 3.0,
-    "output_per_million": 15.0
-  },
-  "openai/gpt-4o": {
-    "input_per_million": 5.0,
-    "output_per_million": 20.0
-  }
-}
-```
-
 ## Kubernetes Deployment (Self-hosted Arch-Router on GPU)

 To run Arch-Router in-cluster using vLLM instead of the default hosted endpoint:
--- a/demos/llm_routing/model_routing_service/config.yaml
+++ b/demos/llm_routing/model_routing_service/config.yaml
@ -34,20 +34,16 @@ routing_preferences:
      prefer: fastest

 model_metrics_sources:
-  - type: digitalocean_pricing
+  - type: cost
+    provider: digitalocean
    refresh_interval: 3600
    model_aliases:
      openai-gpt-4o: openai/gpt-4o
      openai-gpt-4o-mini: openai/gpt-4o-mini
      anthropic-claude-sonnet-4: anthropic/claude-sonnet-4-20250514

-  # Use cost_metrics instead of digitalocean_pricing to supply your own pricing data.
-  # The demo metrics_server.py exposes /costs with OpenAI and Anthropic pricing.
-  # - type: cost_metrics
-  #   url: http://localhost:8080/costs
-  #   refresh_interval: 300
-
-  - type: prometheus_metrics
+  - type: latency
+    provider: prometheus
    url: http://localhost:9090
    query: model_latency_p95_seconds
    refresh_interval: 60
--- a/docs/routing-api.md
+++ b/docs/routing-api.md
@ -134,19 +134,12 @@ routing_preferences:

 # Optional: live cost and latency data sources (max one per type)
 model_metrics_sources:
-  # Option A: DigitalOcean public pricing (no auth required)
-  - type: digitalocean_pricing
+  - type: cost
+    provider: digitalocean
    refresh_interval: 3600

-  # Option B: custom cost endpoint (mutually exclusive with digitalocean_pricing)
-  # - type: cost_metrics
-  #   url: https://internal-cost-api/models
-  #   refresh_interval: 300  # seconds; omit for fetch-once on startup
-  #   auth:
-  #     type: bearer
-  #     token: $COST_API_TOKEN
-
-  - type: prometheus_metrics
+  - type: latency
+    provider: prometheus
    url: https://internal-prometheus/
    query: histogram_quantile(0.95, sum by (model_name, le) (rate(model_latency_seconds_bucket[5m])))
    refresh_interval: 60
@ -158,47 +151,21 @@ Plano validates metric source configuration at startup and exits with a clear er

 | Condition | Error |
 |---|---|
-| `prefer: cheapest` with no cost source | `prefer: cheapest requires a cost data source — add cost_metrics or digitalocean_pricing` |
-| `prefer: fastest` with no `prometheus_metrics` | `prefer: fastest requires a prometheus_metrics source` |
-| Two `cost_metrics` entries | `only one cost_metrics source is allowed` |
-| Two `prometheus_metrics` entries | `only one prometheus_metrics source is allowed` |
-| Two `digitalocean_pricing` entries | `only one digitalocean_pricing source is allowed` |
-| `cost_metrics` and `digitalocean_pricing` both present | `cannot both be configured — use one or the other` |
+| `prefer: cheapest` with no `type: cost` source | `uses prefer: cheapest but no cost metrics source is configured` |
+| `prefer: fastest` with no `type: latency` source | `uses prefer: fastest but no latency metrics source is configured` |
+| Two `type: cost` entries | `only one cost metrics source is allowed` |
+| Two `type: latency` entries | `only one latency metrics source is allowed` |

 If a model listed in `routing_preferences` has no matching entry in the fetched pricing or latency data, Plano logs a `WARN` at startup — the model is still included but ranked last. The same warning is also emitted per routing request when a model has no data in cache at decision time (relevant for inline `routing_preferences` overrides that reference models not covered by the configured metrics sources).

-### cost_metrics endpoint
-
-Plano GETs `url` on startup (and on each `refresh_interval`). Expected response — a JSON object mapping model name to an object with `input_per_million` and `output_per_million` fields:
-
-```json
-{
-  "anthropic/claude-sonnet-4-20250514": {
-    "input_per_million": 3.0,
-    "output_per_million": 15.0
-  },
-  "openai/gpt-4o": {
-    "input_per_million": 5.0,
-    "output_per_million": 20.0
-  },
-  "openai/gpt-4o-mini": {
-    "input_per_million": 0.15,
-    "output_per_million": 0.6
-  }
-}
-```
-
- `auth.type: bearer` adds `Authorization: Bearer <token>` to the request
- Plano combines the two fields as `input_per_million + output_per_million` to produce a single cost scalar used for ranking
- Only relative order matters — the unit (e.g. USD per million tokens) is consistent so ranking is correct
-
-### digitalocean_pricing source
+### Cost metrics (provider: digitalocean)

 Fetches public model pricing from the DigitalOcean Gen-AI catalog. No authentication required.

 ```yaml
 model_metrics_sources:
-  - type: digitalocean_pricing
+  - type: cost
+    provider: digitalocean
    refresh_interval: 3600   # re-fetch every hour; omit to fetch once on startup
    model_aliases:
      openai-gpt-4o: openai/gpt-4o
@ -211,10 +178,9 @@ DO catalog entries are stored by their `model_id` field (e.g. `openai-gpt-4o`).
 **`model_aliases`** — optional. Maps DO `model_id` values to the model names used in `routing_preferences`. Without aliases, cost data is stored under the DO model_id (e.g. `openai-gpt-4o`), which won't match models configured as `openai/gpt-4o`. Aliases let you bridge the naming gap without changing your routing config.

 **Constraints:**
- `cost_metrics` and `digitalocean_pricing` cannot both be configured — use one or the other.
- Only one `digitalocean_pricing` entry is allowed.
+- Only one `type: cost` entry is allowed.

-### prometheus_metrics endpoint
+### Latency metrics (provider: prometheus)

 Plano queries `{url}/api/v1/query?query={query}` on startup and each `refresh_interval`. The PromQL expression must return an instant vector with a `model_name` label: