diff --git a/config/plano_config_schema.yaml b/config/plano_config_schema.yaml index 5270eb48..4d1805ab 100644 --- a/config/plano_config_schema.yaml +++ b/config/plano_config_schema.yaml @@ -548,11 +548,24 @@ properties: refresh_interval: type: integer minimum: 1 + description: "Refresh interval in seconds" required: - type - url - query additionalProperties: false + - type: object + properties: + type: + type: string + const: digitalocean_pricing + refresh_interval: + type: integer + minimum: 1 + description: "Refresh interval in seconds" + required: + - type + additionalProperties: false additionalProperties: false required: diff --git a/crates/brightstaff/src/main.rs b/crates/brightstaff/src/main.rs index 83404233..24a60c14 100644 --- a/crates/brightstaff/src/main.rs +++ b/crates/brightstaff/src/main.rs @@ -220,6 +220,10 @@ async fn init_app_state( .iter() .filter(|s| matches!(s, MetricsSource::PrometheusMetrics { .. })) .count(); + let do_count = sources + .iter() + .filter(|s| matches!(s, MetricsSource::DigitalOceanPricing { .. })) + .count(); if cost_count > 1 { return Err("model_metrics_sources: only one cost_metrics source is allowed".into()); } @@ -228,12 +232,87 @@ async fn init_app_state( "model_metrics_sources: only one prometheus_metrics source is allowed".into(), ); } + if do_count > 1 { + return Err( + "model_metrics_sources: only one digitalocean_pricing source is allowed".into(), + ); + } + if cost_count > 0 && do_count > 0 { + return Err( + "model_metrics_sources: cost_metrics and digitalocean_pricing cannot both be configured — use one or the other".into(), + ); + } let svc = ModelMetricsService::new(sources, reqwest::Client::new()).await; Some(Arc::new(svc)) } else { None }; + // Validate that selection_policy.prefer is compatible with the configured metric sources. + if let Some(ref prefs) = config.routing_preferences { + use common::configuration::{MetricsSource, SelectionPreference}; + + let has_cost_source = config + .model_metrics_sources + .as_deref() + .unwrap_or_default() + .iter() + .any(|s| { + matches!( + s, + MetricsSource::CostMetrics { .. } | MetricsSource::DigitalOceanPricing { .. } + ) + }); + let has_prometheus = config + .model_metrics_sources + .as_deref() + .unwrap_or_default() + .iter() + .any(|s| matches!(s, MetricsSource::PrometheusMetrics { .. })); + + for pref in prefs { + if pref.selection_policy.prefer == SelectionPreference::Cheapest && !has_cost_source { + return Err(format!( + "routing_preferences route '{}' uses prefer: cheapest but no cost data source is configured — \ + add cost_metrics or digitalocean_pricing to model_metrics_sources", + pref.name + ) + .into()); + } + if pref.selection_policy.prefer == SelectionPreference::Fastest && !has_prometheus { + return Err(format!( + "routing_preferences route '{}' uses prefer: fastest but no prometheus_metrics source is configured — \ + add prometheus_metrics to model_metrics_sources", + pref.name + ) + .into()); + } + } + } + + // Warn about models in routing_preferences that have no matching pricing/latency data. 
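+    // The snapshots below reflect only the initial fetch; sources configured with a
+    // refresh_interval may fill in data later, so this check is best-effort and advisory.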
+    if let (Some(ref prefs), Some(ref svc)) = (&config.routing_preferences, &metrics_service) {
+        let cost_data = svc.cost_snapshot().await;
+        let latency_data = svc.latency_snapshot().await;
+        for pref in prefs {
+            use common::configuration::SelectionPreference;
+            for model in &pref.models {
+                let missing = match pref.selection_policy.prefer {
+                    SelectionPreference::Cheapest => !cost_data.contains_key(model.as_str()),
+                    SelectionPreference::Fastest => !latency_data.contains_key(model.as_str()),
+                    _ => false,
+                };
+                if missing {
+                    warn!(
+                        model = %model,
+                        route = %pref.name,
+                        "model has no metric data — will be ranked last"
+                    );
+                }
+            }
+        }
+    }
+
     let router_service = Arc::new(RouterService::new(
         config.routing_preferences.clone(),
         metrics_service,
diff --git a/crates/brightstaff/src/router/model_metrics.rs b/crates/brightstaff/src/router/model_metrics.rs
index 5561118d..078b8938 100644
--- a/crates/brightstaff/src/router/model_metrics.rs
+++ b/crates/brightstaff/src/router/model_metrics.rs
@@ -6,6 +6,8 @@ use common::configuration::{MetricsSource, SelectionPolicy, SelectionPreference}
 use tokio::sync::RwLock;
 use tracing::{info, warn};
 
+const DO_PRICING_URL: &str = "https://api.digitalocean.com/v2/gen-ai/models";
+
 pub struct ModelMetricsService {
     cost: Arc<RwLock<HashMap<String, f64>>>,
     latency: Arc<RwLock<HashMap<String, f64>>>,
@@ -70,6 +72,25 @@ impl ModelMetricsService {
                         });
                     }
                 }
+                MetricsSource::DigitalOceanPricing { refresh_interval } => {
+                    let data = fetch_do_pricing(&client).await;
+                    info!(models = data.len(), "fetched digitalocean pricing");
+                    *cost_data.write().await = data;
+
+                    if let Some(interval_secs) = refresh_interval {
+                        let cost_clone = Arc::clone(&cost_data);
+                        let client_clone = client.clone();
+                        let interval = Duration::from_secs(*interval_secs);
+                        tokio::spawn(async move {
+                            loop {
+                                tokio::time::sleep(interval).await;
+                                let data = fetch_do_pricing(&client_clone).await;
+                                info!(models = data.len(), "refreshed digitalocean pricing");
+                                *cost_clone.write().await = data;
+                            }
+                        });
+                    }
+                }
             }
         }
 
@@ -95,6 +116,16 @@ impl ModelMetricsService {
             SelectionPreference::None => models.to_vec(),
         }
     }
+
+    /// Returns a snapshot of the current cost data. Used at startup to warn about unmatched models.
+    pub async fn cost_snapshot(&self) -> HashMap<String, f64> {
+        self.cost.read().await.clone()
+    }
+
+    /// Returns a snapshot of the current latency data. Used at startup to warn about unmatched models.
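+    /// The map is cloned, so callers can inspect it without holding the read lock.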
+    pub async fn latency_snapshot(&self) -> HashMap<String, f64> {
+        self.latency.read().await.clone()
+    }
 }
 
 fn rank_by_ascending_metric(models: &[String], data: &HashMap<String, f64>) -> Vec<String> {
@@ -134,6 +165,12 @@ fn shuffle(models: &[String]) -> Vec<String> {
     result
 }
 
+#[derive(serde::Deserialize)]
+struct CostEntry {
+    input_per_million: f64,
+    output_per_million: f64,
+}
+
 async fn fetch_cost_metrics(
     url: &str,
     auth: Option<&common::configuration::MetricsAuth>,
@@ -148,8 +185,11 @@ async fn fetch_cost_metrics(
         }
     }
     match req.send().await {
-        Ok(resp) => match resp.json::<HashMap<String, f64>>().await {
-            Ok(data) => data,
+        Ok(resp) => match resp.json::<HashMap<String, CostEntry>>().await {
+            Ok(data) => data
+                .into_iter()
+                .map(|(k, v)| (k, v.input_per_million + v.output_per_million))
+                .collect(),
             Err(err) => {
                 warn!(error = %err, url = %url, "failed to parse cost metrics response");
                 HashMap::new()
@@ -162,6 +202,49 @@ async fn fetch_cost_metrics(
     }
 }
 
+#[derive(serde::Deserialize)]
+struct DoModelList {
+    data: Vec<DoModel>,
+}
+
+#[derive(serde::Deserialize)]
+struct DoModel {
+    model_id: String,
+    creator: String,
+    pricing: DoPricing,
+}
+
+#[derive(serde::Deserialize)]
+struct DoPricing {
+    input_price_per_million: f64,
+    output_price_per_million: f64,
+}
+
+async fn fetch_do_pricing(client: &reqwest::Client) -> HashMap<String, f64> {
+    match client.get(DO_PRICING_URL).send().await {
+        Ok(resp) => match resp.json::<DoModelList>().await {
+            Ok(list) => list
+                .data
+                .into_iter()
+                .map(|m| {
+                    let key = format!("{}/{}", m.creator.to_lowercase(), m.model_id);
+                    let cost =
+                        m.pricing.input_price_per_million + m.pricing.output_price_per_million;
+                    (key, cost)
+                })
+                .collect(),
+            Err(err) => {
+                warn!(error = %err, url = DO_PRICING_URL, "failed to parse digitalocean pricing response");
+                HashMap::new()
+            }
+        },
+        Err(err) => {
+            warn!(error = %err, url = DO_PRICING_URL, "failed to fetch digitalocean pricing");
+            HashMap::new()
+        }
+    }
+}
+
 #[derive(serde::Deserialize)]
 struct PrometheusResponse {
     data: PrometheusData,
diff --git a/crates/common/src/configuration.rs b/crates/common/src/configuration.rs
index cab20845..1e4c1985 100644
--- a/crates/common/src/configuration.rs
+++ b/crates/common/src/configuration.rs
@@ -147,6 +147,9 @@ pub enum MetricsSource {
         query: String,
         refresh_interval: Option<u64>,
     },
+    DigitalOceanPricing {
+        refresh_interval: Option<u64>,
+    },
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
diff --git a/demos/llm_routing/model_routing_service/README.md b/demos/llm_routing/model_routing_service/README.md
index 72b672f3..85306c3b 100644
--- a/demos/llm_routing/model_routing_service/README.md
+++ b/demos/llm_routing/model_routing_service/README.md
@@ -13,42 +13,60 @@ Plano is an AI-native proxy and data plane for agentic apps — with built-in or
 - **One endpoint, many models** — apps call Plano using standard OpenAI/Anthropic APIs; Plano handles provider selection, keys, and failover
 - **Intelligent routing** — a lightweight 1.5B router model classifies user intent and picks the best model per request
+- **Cost & latency ranking** — models are ranked by live cost (DigitalOcean pricing API) or latency (Prometheus) before returning the fallback list
 - **Platform governance** — centralize API keys, rate limits, guardrails, and observability without touching app code
 - **Runs anywhere** — single binary; self-host the router for full data privacy
 
 ## How Routing Works
 
-The entire routing configuration is plain YAML — no code:
+Routing is configured in the top-level `routing_preferences` block (requires `version: v0.4.0`):
 
 ```yaml
-model_providers:
-  - model: openai/gpt-4o-mini
-    default: true  # fallback for unmatched requests
+version: v0.4.0
 
-  - model: openai/gpt-4o
-    routing_preferences:
-      - name: complex_reasoning
-        description: complex reasoning tasks, multi-step analysis
+routing_preferences:
+  - name: complex_reasoning
+    description: complex reasoning tasks, multi-step analysis, or detailed explanations
+    models:
+      - openai/gpt-4o
+      - openai/gpt-4o-mini
+    selection_policy:
+      prefer: cheapest   # rank by live cost data
 
-  - model: anthropic/claude-sonnet-4-20250514
-    routing_preferences:
-      - name: code_generation
-        description: generating new code, writing functions
+  - name: code_generation
+    description: generating new code, writing functions, or creating boilerplate
+    models:
+      - anthropic/claude-sonnet-4-20250514
+      - openai/gpt-4o
+    selection_policy:
+      prefer: fastest    # rank by Prometheus p95 latency
 ```
 
-When a request arrives, Plano sends the conversation and routing preferences to Arch-Router, which classifies the intent and returns the matching route:
+### `selection_policy.prefer` values
+
+| Value | Behavior |
+|---|---|
+| `cheapest` | Sort models by ascending cost. Requires `cost_metrics` or `digitalocean_pricing` in `model_metrics_sources`. |
+| `fastest` | Sort models by ascending P95 latency. Requires `prometheus_metrics` in `model_metrics_sources`. |
+| `random` | Shuffle the model list on each request. |
+| `none` | Return models in definition order — no reordering. |
+
+When a request arrives, Plano:
+
+1. Sends the conversation + route descriptions to Arch-Router for intent classification
+2. Looks up the matched route and ranks its candidate models by cost or latency
+3. Returns an ordered list — the client uses `models[0]` and falls back to `models[1]` on 429/5xx
 
 ```
 1. Request arrives → "Write binary search in Python"
-2. Preferences serialized → [{"name":"code_generation", ...}, {"name":"complex_reasoning", ...}]
-3. Arch-Router classifies → {"route": "code_generation"}
-4. Route → Model lookup → code_generation → anthropic/claude-sonnet-4-20250514
-5. Request forwarded → Claude generates the response
+2. Arch-Router classifies → route: "code_generation"
+3. Rank by latency → claude-sonnet (0.85s) < gpt-4o (1.2s)
+4. Response → models: ["anthropic/claude-sonnet-4-20250514", "openai/gpt-4o"]
 ```
 
-No match? Arch-Router returns `other` → Plano falls back to the default model.
+No match? Arch-Router returns a `null` route → the client falls back to the model named in the original request.
 
-The `/routing/v1/*` endpoints return the routing decision **without** forwarding to the LLM — useful for testing and validating routing behavior before going to production.
+The `/routing/v1/*` endpoints return the routing decision **without** forwarding to the LLM — useful for testing routing behavior before going to production.
 
 ## Setup
 
@@ -59,12 +77,28 @@
 export OPENAI_API_KEY=
 export ANTHROPIC_API_KEY=
 ```
 
-Start Plano:
+Start Prometheus and the mock latency metrics server:
+
 ```bash
 cd demos/llm_routing/model_routing_service
+docker compose up -d
+```
+
+Then start Plano:
+
+```bash
 planoai up config.yaml
 ```
 
+On startup you should see logs like:
+
+```
+fetched digitalocean pricing: N models
+fetched prometheus latency metrics: 3 models
+```
+
+If a model in `routing_preferences` has no matching pricing or latency data, Plano logs a warning at startup — the model is still included but ranked last.
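+
+To make the fallback behavior concrete, here is a minimal client sketch. This is not shipped with the demo; it assumes the `requests` package and that chat requests go to the standard OpenAI-style `/v1/chat/completions` path on the same listener:
+
+```python
+import requests
+
+ROUTING_URL = "http://localhost:12000/routing/v1/chat/completions"
+CHAT_URL = "http://localhost:12000/v1/chat/completions"  # assumed OpenAI-compatible path
+
+payload = {"messages": [{"role": "user", "content": "Write binary search in Python"}]}
+
+# Ask Plano for the ranked candidate list; no LLM call is made here.
+ranked = requests.post(ROUTING_URL, json=payload).json().get("models", [])
+
+# Try candidates in order, falling back on 429/5xx as described above.
+for model in ranked:
+    resp = requests.post(CHAT_URL, json={**payload, "model": model})
+    if resp.status_code == 429 or resp.status_code >= 500:
+        continue  # rate-limited or upstream error; try the next candidate
+    resp.raise_for_status()
+    print(f"{model}: {resp.json()['choices'][0]['message']['content']}")
+    break
+```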
+ ## Run the demo ```bash @@ -95,13 +129,65 @@ curl http://localhost:12000/routing/v1/chat/completions \ Response: ```json { - "model": "anthropic/claude-sonnet-4-20250514", + "models": ["anthropic/claude-sonnet-4-20250514", "openai/gpt-4o"], "route": "code_generation", "trace_id": "c16d1096c1af4a17abb48fb182918a88" } ``` -The response tells you which model would handle this request and which route was matched, without actually making the LLM call. +The response contains the ranked model list — your client should try `models[0]` first and fall back to `models[1]` on 429 or 5xx errors. + +## Metrics Sources + +### DigitalOcean Pricing (`digitalocean_pricing`) + +Fetches public model pricing from the DigitalOcean Gen-AI catalog (no auth required). Model IDs are normalized as `lowercase(creator)/model_id`. Cost scalar = `input_price_per_million + output_price_per_million`. + +```yaml +model_metrics_sources: + - type: digitalocean_pricing + refresh_interval: 3600 # re-fetch every hour +``` + +### Prometheus Latency (`prometheus_metrics`) + +Queries a Prometheus instance for P95 latency. The PromQL expression must return an instant vector with a `model_name` label matching the model names in `routing_preferences`. + +```yaml +model_metrics_sources: + - type: prometheus_metrics + url: http://localhost:9090 + query: model_latency_p95_seconds + refresh_interval: 60 +``` + +The demo's `metrics_server.py` exposes mock latency data; `docker compose up -d` starts it alongside Prometheus. + +### Custom Cost Endpoint (`cost_metrics`) + +```yaml +model_metrics_sources: + - type: cost_metrics + url: https://my-internal-pricing-api/costs + auth: + type: bearer + token: $PRICING_TOKEN + refresh_interval: 300 +``` + +Expected response format: +```json +{ + "anthropic/claude-sonnet-4-20250514": { + "input_per_million": 3.0, + "output_per_million": 15.0 + }, + "openai/gpt-4o": { + "input_per_million": 5.0, + "output_per_million": 20.0 + } +} +``` ## Kubernetes Deployment (Self-hosted Arch-Router on GPU) @@ -119,7 +205,6 @@ GPU nodes commonly have a `nvidia.com/gpu:NoSchedule` taint — `vllm-deployment **1. Deploy Arch-Router and Plano:** ```bash - # arch-router deployment kubectl apply -f vllm-deployment.yaml @@ -165,39 +250,3 @@ kubectl create configmap plano-config \ --dry-run=client -o yaml | kubectl apply -f - kubectl rollout restart deployment/plano ``` - -## Demo Output - -``` -=== Model Routing Service Demo === - ---- 1. Code generation query (OpenAI format) --- -{ - "model": "anthropic/claude-sonnet-4-20250514", - "route": "code_generation", - "trace_id": "c16d1096c1af4a17abb48fb182918a88" -} - ---- 2. Complex reasoning query (OpenAI format) --- -{ - "model": "openai/gpt-4o", - "route": "complex_reasoning", - "trace_id": "30795e228aff4d7696f082ed01b75ad4" -} - ---- 3. Simple query - no routing match (OpenAI format) --- -{ - "model": "none", - "route": null, - "trace_id": "ae0b6c3b220d499fb5298ac63f4eac0e" -} - ---- 4. 
Code generation query (Anthropic format) --- -{ - "model": "anthropic/claude-sonnet-4-20250514", - "route": "code_generation", - "trace_id": "26be822bbdf14a3ba19fe198e55ea4a9" -} - -=== Demo Complete === -``` diff --git a/demos/llm_routing/model_routing_service/config.yaml b/demos/llm_routing/model_routing_service/config.yaml index 7b98b25b..34ae2f50 100644 --- a/demos/llm_routing/model_routing_service/config.yaml +++ b/demos/llm_routing/model_routing_service/config.yaml @@ -1,4 +1,4 @@ -version: v0.3.0 +version: v0.4.0 listeners: - type: model @@ -6,22 +6,41 @@ listeners: port: 12000 model_providers: - - model: openai/gpt-4o-mini access_key: $OPENAI_API_KEY default: true - model: openai/gpt-4o access_key: $OPENAI_API_KEY - routing_preferences: - - name: complex_reasoning - description: complex reasoning tasks, multi-step analysis, or detailed explanations - model: anthropic/claude-sonnet-4-20250514 access_key: $ANTHROPIC_API_KEY - routing_preferences: - - name: code_generation - description: generating new code, writing functions, or creating boilerplate + +routing_preferences: + - name: complex_reasoning + description: complex reasoning tasks, multi-step analysis, or detailed explanations + models: + - openai/gpt-4o + - openai/gpt-4o-mini + selection_policy: + prefer: cheapest + + - name: code_generation + description: generating new code, writing functions, or creating boilerplate + models: + - anthropic/claude-sonnet-4-20250514 + - openai/gpt-4o + selection_policy: + prefer: fastest + +model_metrics_sources: + - type: digitalocean_pricing + refresh_interval: 3600 + + - type: prometheus_metrics + url: http://localhost:9090 + query: model_latency_p95_seconds + refresh_interval: 60 tracing: random_sampling: 100 diff --git a/demos/llm_routing/model_routing_service/docker-compose.yaml b/demos/llm_routing/model_routing_service/docker-compose.yaml new file mode 100644 index 00000000..0f058e7b --- /dev/null +++ b/demos/llm_routing/model_routing_service/docker-compose.yaml @@ -0,0 +1,17 @@ +services: + prometheus: + image: prom/prometheus:latest + ports: + - "9090:9090" + volumes: + - ./prometheus.yaml:/etc/prometheus/prometheus.yml:ro + depends_on: + - model-metrics + + model-metrics: + image: python:3.11-slim + ports: + - "8080:8080" + volumes: + - ./metrics_server.py:/metrics_server.py:ro + command: python /metrics_server.py diff --git a/demos/llm_routing/model_routing_service/metrics_server.py b/demos/llm_routing/model_routing_service/metrics_server.py new file mode 100644 index 00000000..a7a22081 --- /dev/null +++ b/demos/llm_routing/model_routing_service/metrics_server.py @@ -0,0 +1,30 @@ +""" +Minimal Prometheus metrics server for demo purposes. +Exposes mock P95 latency data for model routing. 
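+Latency values are hard-coded in METRICS below; edit them to change how models rank.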
+""" +from http.server import HTTPServer, BaseHTTPRequestHandler + +METRICS = """\ +# HELP model_latency_p95_seconds P95 request latency in seconds per model +# TYPE model_latency_p95_seconds gauge +model_latency_p95_seconds{model_name="anthropic/claude-sonnet-4-20250514"} 0.85 +model_latency_p95_seconds{model_name="openai/gpt-4o"} 1.20 +model_latency_p95_seconds{model_name="openai/gpt-4o-mini"} 0.40 +""".encode() + + +class MetricsHandler(BaseHTTPRequestHandler): + def do_GET(self): + self.send_response(200) + self.send_header("Content-Type", "text/plain; version=0.0.4; charset=utf-8") + self.end_headers() + self.wfile.write(METRICS) + + def log_message(self, fmt, *args): + pass # suppress access logs + + +if __name__ == "__main__": + server = HTTPServer(("", 8080), MetricsHandler) + print("metrics server listening on :8080", flush=True) + server.serve_forever() diff --git a/demos/llm_routing/model_routing_service/prometheus.yaml b/demos/llm_routing/model_routing_service/prometheus.yaml new file mode 100644 index 00000000..6b0091fc --- /dev/null +++ b/demos/llm_routing/model_routing_service/prometheus.yaml @@ -0,0 +1,8 @@ +global: + scrape_interval: 15s + +scrape_configs: + - job_name: model_latency + static_configs: + - targets: + - model-metrics:8080 diff --git a/docs/routing-api.md b/docs/routing-api.md index 6ba0524e..f4d75803 100644 --- a/docs/routing-api.md +++ b/docs/routing-api.md @@ -135,12 +135,17 @@ routing_preferences: # Optional: live cost and latency data sources (max one per type) model_metrics_sources: - - type: cost_metrics - url: https://internal-cost-api/models - refresh_interval: 300 # seconds; omit for fetch-once on startup - auth: - type: bearer - token: $COST_API_TOKEN + # Option A: DigitalOcean public pricing (no auth required) + - type: digitalocean_pricing + refresh_interval: 3600 + + # Option B: custom cost endpoint (mutually exclusive with digitalocean_pricing) + # - type: cost_metrics + # url: https://internal-cost-api/models + # refresh_interval: 300 # seconds; omit for fetch-once on startup + # auth: + # type: bearer + # token: $COST_API_TOKEN - type: prometheus_metrics url: https://internal-prometheus/ @@ -148,20 +153,61 @@ model_metrics_sources: refresh_interval: 60 ``` +### Startup validation + +Plano validates metric source configuration at startup and exits with a clear error if: + +| Condition | Error | +|---|---| +| `prefer: cheapest` with no cost source | `prefer: cheapest requires a cost data source — add cost_metrics or digitalocean_pricing` | +| `prefer: fastest` with no `prometheus_metrics` | `prefer: fastest requires a prometheus_metrics source` | +| Two `cost_metrics` entries | `only one cost_metrics source is allowed` | +| Two `prometheus_metrics` entries | `only one prometheus_metrics source is allowed` | +| Two `digitalocean_pricing` entries | `only one digitalocean_pricing source is allowed` | +| `cost_metrics` and `digitalocean_pricing` both present | `cannot both be configured — use one or the other` | + +If a model listed in `routing_preferences` has no matching entry in the fetched pricing or latency data, Plano logs a `WARN` at startup — the model is still included but ranked last. + ### cost_metrics endpoint -Plano GETs `url` on startup (and on each `refresh_interval`). Expected response — a flat JSON object mapping model name to cost value: +Plano GETs `url` on startup (and on each `refresh_interval`). 
Expected response — a JSON object mapping model name to an object with `input_per_million` and `output_per_million` fields:
 
 ```json
 {
-  "anthropic/claude-sonnet-4-20250514": 0.003,
-  "openai/gpt-4o": 0.005,
-  "openai/gpt-4o-mini": 0.00015
+  "anthropic/claude-sonnet-4-20250514": {
+    "input_per_million": 3.0,
+    "output_per_million": 15.0
+  },
+  "openai/gpt-4o": {
+    "input_per_million": 5.0,
+    "output_per_million": 20.0
+  },
+  "openai/gpt-4o-mini": {
+    "input_per_million": 0.15,
+    "output_per_million": 0.6
+  }
 }
 ```
 
 - `auth.type: bearer` adds `Authorization: Bearer <token>` to the request
-- Cost units are arbitrary (e.g. USD per 1k tokens) — only relative order matters
+- Plano combines the two fields as `input_per_million + output_per_million` to produce a single cost scalar used for ranking
+- Only relative order matters — any unit (e.g. USD per million tokens) works as long as it is consistent across models
+
+### digitalocean_pricing source
+
+Fetches public model pricing from the DigitalOcean Gen-AI catalog. No authentication required.
+
+```yaml
+model_metrics_sources:
+  - type: digitalocean_pricing
+    refresh_interval: 3600   # re-fetch every hour; omit to fetch once on startup
+```
+
+Model IDs are normalized as `lowercase(creator)/model_id` — for example, `creator: "OpenAI"`, `model_id: "openai-gpt-4o"` → `"openai/openai-gpt-4o"`. The cost scalar is `input_price_per_million + output_price_per_million`.
+
+**Constraints:**
+- `cost_metrics` and `digitalocean_pricing` cannot both be configured — use one or the other.
+- Only one `digitalocean_pricing` entry is allowed.
 
 ### prometheus_metrics endpoint