diff --git a/config/plano_config_schema.yaml b/config/plano_config_schema.yaml index 6ee28a83..276f7699 100644 --- a/config/plano_config_schema.yaml +++ b/config/plano_config_schema.yaml @@ -512,52 +512,11 @@ properties: properties: type: type: string - const: cost_metrics - url: + const: cost + provider: type: string - refresh_interval: - type: integer - minimum: 1 - auth: - type: object - properties: - type: - type: string - enum: - - bearer - token: - type: string - required: - - type - - token - additionalProperties: false - required: - - type - - url - additionalProperties: false - - type: object - properties: - type: - type: string - const: prometheus_metrics - url: - type: string - query: - type: string - refresh_interval: - type: integer - minimum: 1 - description: "Refresh interval in seconds" - required: - - type - - url - - query - additionalProperties: false - - type: object - properties: - type: - type: string - const: digitalocean_pricing + enum: + - digitalocean refresh_interval: type: integer minimum: 1 @@ -569,6 +528,30 @@ properties: type: string required: - type + - provider + additionalProperties: false + - type: object + properties: + type: + type: string + const: latency + provider: + type: string + enum: + - prometheus + url: + type: string + query: + type: string + refresh_interval: + type: integer + minimum: 1 + description: "Refresh interval in seconds" + required: + - type + - provider + - url + - query additionalProperties: false additionalProperties: false diff --git a/crates/brightstaff/src/main.rs b/crates/brightstaff/src/main.rs index 19ef9efb..bc88b60b 100644 --- a/crates/brightstaff/src/main.rs +++ b/crates/brightstaff/src/main.rs @@ -216,33 +216,17 @@ async fn init_app_state( use common::configuration::MetricsSource; let cost_count = sources .iter() - .filter(|s| matches!(s, MetricsSource::CostMetrics { .. 
})) + .filter(|s| matches!(s, MetricsSource::Cost(_))) .count(); - let prom_count = sources + let latency_count = sources .iter() - .filter(|s| matches!(s, MetricsSource::PrometheusMetrics { .. })) - .count(); - let do_count = sources - .iter() - .filter(|s| matches!(s, MetricsSource::DigitalOceanPricing { .. })) + .filter(|s| matches!(s, MetricsSource::Latency(_))) .count(); if cost_count > 1 { - return Err("model_metrics_sources: only one cost_metrics source is allowed".into()); + return Err("model_metrics_sources: only one cost metrics source is allowed".into()); } - if prom_count > 1 { - return Err( - "model_metrics_sources: only one prometheus_metrics source is allowed".into(), - ); - } - if do_count > 1 { - return Err( - "model_metrics_sources: only one digitalocean_pricing source is allowed".into(), - ); - } - if cost_count > 0 && do_count > 0 { - return Err( - "model_metrics_sources: cost_metrics and digitalocean_pricing cannot both be configured — use one or the other".into(), - ); + if latency_count > 1 { + return Err("model_metrics_sources: only one latency metrics source is allowed".into()); } let svc = ModelMetricsService::new(sources, reqwest::Client::new()).await; Some(Arc::new(svc)) @@ -259,32 +243,27 @@ async fn init_app_state( .as_deref() .unwrap_or_default() .iter() - .any(|s| { - matches!( - s, - MetricsSource::CostMetrics { .. } | MetricsSource::DigitalOceanPricing { .. } - ) - }); - let has_prometheus = config + .any(|s| matches!(s, MetricsSource::Cost(_))); + let has_latency_source = config .model_metrics_sources .as_deref() .unwrap_or_default() .iter() - .any(|s| matches!(s, MetricsSource::PrometheusMetrics { .. 
})); + .any(|s| matches!(s, MetricsSource::Latency(_))); for pref in prefs { if pref.selection_policy.prefer == SelectionPreference::Cheapest && !has_cost_source { return Err(format!( - "routing_preferences route '{}' uses prefer: cheapest but no cost data source is configured — \ - add cost_metrics or digitalocean_pricing to model_metrics_sources", + "routing_preferences route '{}' uses prefer: cheapest but no cost metrics source is configured — \ + add a cost metrics source to model_metrics_sources", pref.name ) .into()); } - if pref.selection_policy.prefer == SelectionPreference::Fastest && !has_prometheus { + if pref.selection_policy.prefer == SelectionPreference::Fastest && !has_latency_source { return Err(format!( - "routing_preferences route '{}' uses prefer: fastest but no prometheus_metrics source is configured — \ - add prometheus_metrics to model_metrics_sources", + "routing_preferences route '{}' uses prefer: fastest but no latency metrics source is configured — \ + add a latency metrics source to model_metrics_sources", pref.name ) .into()); diff --git a/crates/brightstaff/src/router/model_metrics.rs b/crates/brightstaff/src/router/model_metrics.rs index 2ad40254..ceafe4b8 100644 --- a/crates/brightstaff/src/router/model_metrics.rs +++ b/crates/brightstaff/src/router/model_metrics.rs @@ -2,7 +2,9 @@ use std::collections::HashMap; use std::sync::Arc; use std::time::Duration; -use common::configuration::{MetricsSource, SelectionPolicy, SelectionPreference}; +use common::configuration::{ + CostProvider, LatencyProvider, MetricsSource, SelectionPolicy, SelectionPreference, +}; use tokio::sync::RwLock; use tracing::{info, warn}; @@ -20,81 +22,52 @@ impl ModelMetricsService { for source in sources { match source { - MetricsSource::CostMetrics { - url, - refresh_interval, - auth, - } => { - let data = fetch_cost_metrics(url, auth.as_ref(), &client).await; - info!(models = data.len(), url = %url, "fetched cost metrics"); - *cost_data.write().await = data; + 
MetricsSource::Cost(cfg) => match cfg.provider { + CostProvider::Digitalocean => { + let aliases = cfg.model_aliases.clone().unwrap_or_default(); + let data = fetch_do_pricing(&client, &aliases).await; + info!(models = data.len(), "fetched digitalocean pricing"); + *cost_data.write().await = data; - if let Some(interval_secs) = refresh_interval { - let cost_clone = Arc::clone(&cost_data); - let client_clone = client.clone(); - let url = url.clone(); - let auth = auth.clone(); - let interval = Duration::from_secs(*interval_secs); - tokio::spawn(async move { - loop { - tokio::time::sleep(interval).await; - let data = - fetch_cost_metrics(&url, auth.as_ref(), &client_clone).await; - info!(models = data.len(), url = %url, "refreshed cost metrics"); - *cost_clone.write().await = data; - } - }); + if let Some(interval_secs) = cfg.refresh_interval { + let cost_clone = Arc::clone(&cost_data); + let client_clone = client.clone(); + let interval = Duration::from_secs(interval_secs); + tokio::spawn(async move { + loop { + tokio::time::sleep(interval).await; + let data = fetch_do_pricing(&client_clone, &aliases).await; + info!(models = data.len(), "refreshed digitalocean pricing"); + *cost_clone.write().await = data; + } + }); + } } - } - MetricsSource::PrometheusMetrics { - url, - query, - refresh_interval, - } => { - let data = fetch_prometheus_metrics(url, query, &client).await; - info!(models = data.len(), url = %url, "fetched prometheus latency metrics"); - *latency_data.write().await = data; + }, + MetricsSource::Latency(cfg) => match cfg.provider { + LatencyProvider::Prometheus => { + let data = fetch_prometheus_metrics(&cfg.url, &cfg.query, &client).await; + info!(models = data.len(), url = %cfg.url, "fetched latency metrics"); + *latency_data.write().await = data; - if let Some(interval_secs) = refresh_interval { - let latency_clone = Arc::clone(&latency_data); - let client_clone = client.clone(); - let url = url.clone(); - let query = query.clone(); - let interval = 
Duration::from_secs(*interval_secs); - tokio::spawn(async move { - loop { - tokio::time::sleep(interval).await; - let data = - fetch_prometheus_metrics(&url, &query, &client_clone).await; - info!(models = data.len(), url = %url, "refreshed prometheus latency metrics"); - *latency_clone.write().await = data; - } - }); + if let Some(interval_secs) = cfg.refresh_interval { + let latency_clone = Arc::clone(&latency_data); + let client_clone = client.clone(); + let url = cfg.url.clone(); + let query = cfg.query.clone(); + let interval = Duration::from_secs(interval_secs); + tokio::spawn(async move { + loop { + tokio::time::sleep(interval).await; + let data = + fetch_prometheus_metrics(&url, &query, &client_clone).await; + info!(models = data.len(), url = %url, "refreshed latency metrics"); + *latency_clone.write().await = data; + } + }); + } } - } - MetricsSource::DigitalOceanPricing { - refresh_interval, - model_aliases, - } => { - let aliases = model_aliases.clone().unwrap_or_default(); - let data = fetch_do_pricing(&client, &aliases).await; - info!(models = data.len(), "fetched digitalocean pricing"); - *cost_data.write().await = data; - - if let Some(interval_secs) = refresh_interval { - let cost_clone = Arc::clone(&cost_data); - let client_clone = client.clone(); - let interval = Duration::from_secs(*interval_secs); - tokio::spawn(async move { - loop { - tokio::time::sleep(interval).await; - let data = fetch_do_pricing(&client_clone, &aliases).await; - info!(models = data.len(), "refreshed digitalocean pricing"); - *cost_clone.write().await = data; - } - }); - } - } + }, } } @@ -160,43 +133,6 @@ fn rank_by_ascending_metric(models: &[String], data: &HashMap) -> V .collect() } -#[derive(serde::Deserialize)] -struct CostEntry { - input_per_million: f64, - output_per_million: f64, -} - -async fn fetch_cost_metrics( - url: &str, - auth: Option<&common::configuration::MetricsAuth>, - client: &reqwest::Client, -) -> HashMap { - let mut req = client.get(url); - if let 
Some(auth) = auth { - if auth.auth_type == "bearer" { - req = req.header("Authorization", format!("Bearer {}", auth.token)); - } else { - warn!(auth_type = %auth.auth_type, "unsupported auth type for cost_metrics, skipping auth"); - } - } - match req.send().await { - Ok(resp) => match resp.json::>().await { - Ok(data) => data - .into_iter() - .map(|(k, v)| (k, v.input_per_million + v.output_per_million)) - .collect(), - Err(err) => { - warn!(error = %err, url = %url, "failed to parse cost metrics response"); - HashMap::new() - } - }, - Err(err) => { - warn!(error = %err, url = %url, "failed to fetch cost metrics"); - HashMap::new() - } - } -} - #[derive(serde::Deserialize)] struct DoModelList { data: Vec, diff --git a/crates/common/src/configuration.rs b/crates/common/src/configuration.rs index cd074a8b..167b9c3e 100644 --- a/crates/common/src/configuration.rs +++ b/crates/common/src/configuration.rs @@ -127,32 +127,39 @@ pub struct TopLevelRoutingPreference { } #[derive(Debug, Clone, Serialize, Deserialize)] -pub struct MetricsAuth { - #[serde(rename = "type")] - pub auth_type: String, // only "bearer" supported - pub token: String, +#[serde(tag = "type", rename_all = "snake_case")] +pub enum MetricsSource { + Cost(CostMetricsConfig), + Latency(LatencyMetricsConfig), } #[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(tag = "type", rename_all = "snake_case")] -pub enum MetricsSource { - CostMetrics { - url: String, - refresh_interval: Option, - auth: Option, - }, - PrometheusMetrics { - url: String, - query: String, - refresh_interval: Option, - }, - #[serde(rename = "digitalocean_pricing")] - DigitalOceanPricing { - refresh_interval: Option, - /// Map DO catalog keys (`lowercase(creator)/model_id`) to Plano model names. 
- /// Example: `openai/openai-gpt-oss-120b: openai/gpt-4o` - model_aliases: Option>, - }, +pub struct CostMetricsConfig { + pub provider: CostProvider, + pub refresh_interval: Option, + /// Map DO catalog keys (`lowercase(creator)/model_id`) to Plano model names. + /// Example: `openai/openai-gpt-oss-120b: openai/gpt-4o` + pub model_aliases: Option>, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum CostProvider { + Digitalocean, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct LatencyMetricsConfig { + pub provider: LatencyProvider, + pub url: String, + pub query: String, + pub refresh_interval: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum LatencyProvider { + Prometheus, } #[derive(Debug, Clone, Serialize, Deserialize)] diff --git a/demos/llm_routing/model_routing_service/README.md b/demos/llm_routing/model_routing_service/README.md index 85306c3b..bffd5e89 100644 --- a/demos/llm_routing/model_routing_service/README.md +++ b/demos/llm_routing/model_routing_service/README.md @@ -46,8 +46,8 @@ routing_preferences: | Value | Behavior | |---|---| -| `cheapest` | Sort models by ascending cost. Requires `cost_metrics` or `digitalocean_pricing` in `model_metrics_sources`. | -| `fastest` | Sort models by ascending P95 latency. Requires `prometheus_metrics` in `model_metrics_sources`. | +| `cheapest` | Sort models by ascending cost. Requires a `type: cost` source in `model_metrics_sources`. | +| `fastest` | Sort models by ascending P95 latency. Requires a `type: latency` source in `model_metrics_sources`. | | `random` | Shuffle the model list on each request. | | `none` | Return models in definition order — no reordering. 
| @@ -139,23 +139,25 @@ The response contains the ranked model list — your client should try `models[0 ## Metrics Sources -### DigitalOcean Pricing (`digitalocean_pricing`) +### Cost Metrics (provider: digitalocean) Fetches public model pricing from the DigitalOcean Gen-AI catalog (no auth required). Model IDs are normalized as `lowercase(creator)/model_id`. Cost scalar = `input_price_per_million + output_price_per_million`. ```yaml model_metrics_sources: - - type: digitalocean_pricing + - type: cost + provider: digitalocean refresh_interval: 3600 # re-fetch every hour ``` -### Prometheus Latency (`prometheus_metrics`) +### Latency Metrics (provider: prometheus) Queries a Prometheus instance for P95 latency. The PromQL expression must return an instant vector with a `model_name` label matching the model names in `routing_preferences`. ```yaml model_metrics_sources: - - type: prometheus_metrics + - type: latency + provider: prometheus url: http://localhost:9090 query: model_latency_p95_seconds refresh_interval: 60 @@ -163,32 +165,6 @@ model_metrics_sources: The demo's `metrics_server.py` exposes mock latency data; `docker compose up -d` starts it alongside Prometheus. 
-### Custom Cost Endpoint (`cost_metrics`) - -```yaml -model_metrics_sources: - - type: cost_metrics - url: https://my-internal-pricing-api/costs - auth: - type: bearer - token: $PRICING_TOKEN - refresh_interval: 300 -``` - -Expected response format: -```json -{ - "anthropic/claude-sonnet-4-20250514": { - "input_per_million": 3.0, - "output_per_million": 15.0 - }, - "openai/gpt-4o": { - "input_per_million": 5.0, - "output_per_million": 20.0 - } -} -``` - ## Kubernetes Deployment (Self-hosted Arch-Router on GPU) To run Arch-Router in-cluster using vLLM instead of the default hosted endpoint: diff --git a/demos/llm_routing/model_routing_service/config.yaml b/demos/llm_routing/model_routing_service/config.yaml index 6f20134b..dcd26ce8 100644 --- a/demos/llm_routing/model_routing_service/config.yaml +++ b/demos/llm_routing/model_routing_service/config.yaml @@ -34,20 +34,16 @@ routing_preferences: prefer: fastest model_metrics_sources: - - type: digitalocean_pricing + - type: cost + provider: digitalocean refresh_interval: 3600 model_aliases: openai-gpt-4o: openai/gpt-4o openai-gpt-4o-mini: openai/gpt-4o-mini anthropic-claude-sonnet-4: anthropic/claude-sonnet-4-20250514 - # Use cost_metrics instead of digitalocean_pricing to supply your own pricing data. - # The demo metrics_server.py exposes /costs with OpenAI and Anthropic pricing. 
- # - type: cost_metrics - # url: http://localhost:8080/costs - # refresh_interval: 300 - - - type: prometheus_metrics + - type: latency + provider: prometheus url: http://localhost:9090 query: model_latency_p95_seconds refresh_interval: 60 diff --git a/docs/routing-api.md b/docs/routing-api.md index f3ed0552..4954c938 100644 --- a/docs/routing-api.md +++ b/docs/routing-api.md @@ -134,19 +134,12 @@ routing_preferences: # Optional: live cost and latency data sources (max one per type) model_metrics_sources: - # Option A: DigitalOcean public pricing (no auth required) - - type: digitalocean_pricing + - type: cost + provider: digitalocean refresh_interval: 3600 - # Option B: custom cost endpoint (mutually exclusive with digitalocean_pricing) - # - type: cost_metrics - # url: https://internal-cost-api/models - # refresh_interval: 300 # seconds; omit for fetch-once on startup - # auth: - # type: bearer - # token: $COST_API_TOKEN - - - type: prometheus_metrics + - type: latency + provider: prometheus url: https://internal-prometheus/ query: histogram_quantile(0.95, sum by (model_name, le) (rate(model_latency_seconds_bucket[5m]))) refresh_interval: 60 @@ -158,47 +151,21 @@ Plano validates metric source configuration at startup and exits with a clear er | Condition | Error | |---|---| -| `prefer: cheapest` with no cost source | `prefer: cheapest requires a cost data source — add cost_metrics or digitalocean_pricing` | -| `prefer: fastest` with no `prometheus_metrics` | `prefer: fastest requires a prometheus_metrics source` | -| Two `cost_metrics` entries | `only one cost_metrics source is allowed` | -| Two `prometheus_metrics` entries | `only one prometheus_metrics source is allowed` | -| Two `digitalocean_pricing` entries | `only one digitalocean_pricing source is allowed` | -| `cost_metrics` and `digitalocean_pricing` both present | `cannot both be configured — use one or the other` | +| `prefer: cheapest` with no cost source | `uses prefer: cheapest but no cost metrics 
source is configured` | +| `prefer: fastest` with no latency source | `uses prefer: fastest but no latency metrics source is configured` | +| Two `type: cost` entries | `only one cost metrics source is allowed` | +| Two `type: latency` entries | `only one latency metrics source is allowed` | If a model listed in `routing_preferences` has no matching entry in the fetched pricing or latency data, Plano logs a `WARN` at startup — the model is still included but ranked last. The same warning is also emitted per routing request when a model has no data in cache at decision time (relevant for inline `routing_preferences` overrides that reference models not covered by the configured metrics sources). -### cost_metrics endpoint - -Plano GETs `url` on startup (and on each `refresh_interval`). Expected response — a JSON object mapping model name to an object with `input_per_million` and `output_per_million` fields: - -```json -{ - "anthropic/claude-sonnet-4-20250514": { - "input_per_million": 3.0, - "output_per_million": 15.0 - }, - "openai/gpt-4o": { - "input_per_million": 5.0, - "output_per_million": 20.0 - }, - "openai/gpt-4o-mini": { - "input_per_million": 0.15, - "output_per_million": 0.6 - } -} -``` - -- `auth.type: bearer` adds `Authorization: Bearer ` to the request -- Plano combines the two fields as `input_per_million + output_per_million` to produce a single cost scalar used for ranking -- Only relative order matters — the unit (e.g. USD per million tokens) is consistent so ranking is correct - -### digitalocean_pricing source +### Cost metrics (provider: digitalocean) Fetches public model pricing from the DigitalOcean Gen-AI catalog. No authentication required. ```yaml model_metrics_sources: - - type: digitalocean_pricing + - type: cost + provider: digitalocean refresh_interval: 3600 # re-fetch every hour; omit to fetch once on startup model_aliases: openai-gpt-4o: openai/gpt-4o @@ -211,10 +178,9 @@ DO catalog entries are stored by their `model_id` field (e.g. `openai-gpt-4o`). 
**`model_aliases`** — optional. Maps DO `model_id` values to the model names used in `routing_preferences`. Without aliases, cost data is stored under the DO model_id (e.g. `openai-gpt-4o`), which won't match models configured as `openai/gpt-4o`. Aliases let you bridge the naming gap without changing your routing config. **Constraints:** -- `cost_metrics` and `digitalocean_pricing` cannot both be configured — use one or the other. -- Only one `digitalocean_pricing` entry is allowed. +- Only one `type: cost` entry is allowed. -### prometheus_metrics endpoint +### Latency metrics (provider: prometheus) Plano queries `{url}/api/v1/query?query={query}` on startup and each `refresh_interval`. The PromQL expression must return an instant vector with a `model_name` label: