feat: make model pricing source configurable (models.dev + DigitalOcean) (#971)

This commit is contained in:
Musa 2026-06-24 10:14:12 -07:00 committed by GitHub
parent 5cc4c4ee77
commit 558df0307c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 687 additions and 48 deletions

View file

@ -1,7 +1,8 @@
"""DigitalOcean Gradient pricing catalog for the obs console. """Model pricing catalog for the obs console.
Ported loosely from ``crates/brightstaff/src/router/model_metrics.rs::fetch_do_pricing``. Mirrors ``crates/brightstaff/src/router/model_metrics.rs``. The source is
Single-source: one fetch at startup, cached for the life of the process. configurable: ``digitalocean`` (DO GenAI catalog) or ``models.dev``. A single
fetch at startup is cached for the life of the process.
""" """
from __future__ import annotations from __future__ import annotations
@ -14,7 +15,18 @@ from typing import Any
import requests import requests
DEFAULT_PRICING_URL = "https://api.digitalocean.com/v2/gen-ai/models/catalog" DO_PRICING_URL = "https://api.digitalocean.com/v2/gen-ai/models/catalog"
MODELS_DEV_URL = "https://models.dev/api.json"
# Backwards-compatible default (DigitalOcean) used when no provider is given.
DEFAULT_PRICING_URL = DO_PRICING_URL
DEFAULT_PRICING_PROVIDER = "digitalocean"
_DEFAULT_URLS = {
"digitalocean": DO_PRICING_URL,
"models.dev": MODELS_DEV_URL,
}
FETCH_TIMEOUT_SECS = 5.0 FETCH_TIMEOUT_SECS = 5.0
@ -51,36 +63,52 @@ class PricingCatalog:
return list(self._prices.keys())[:n] return list(self._prices.keys())[:n]
@classmethod @classmethod
def fetch(cls, url: str = DEFAULT_PRICING_URL) -> "PricingCatalog": def fetch(
"""Fetch pricing from DO's catalog endpoint. On failure, returns an cls,
provider: str = DEFAULT_PRICING_PROVIDER,
url: str | None = None,
) -> "PricingCatalog":
"""Fetch pricing from the configured catalog. On failure, returns an
empty catalog (cost column will be blank). empty catalog (cost column will be blank).
The catalog endpoint is public no auth required, no signup so ``provider`` selects the parser/default URL: ``digitalocean`` or
``planoai obs`` gets cost data on first run out of the box. ``models.dev``. Both catalog endpoints are public no auth required
so ``planoai obs`` gets cost data on first run out of the box.
""" """
provider = (provider or DEFAULT_PRICING_PROVIDER).strip().lower()
resolved_url = url or _DEFAULT_URLS.get(provider, DO_PRICING_URL)
try: try:
resp = requests.get(url, timeout=FETCH_TIMEOUT_SECS) resp = requests.get(resolved_url, timeout=FETCH_TIMEOUT_SECS)
resp.raise_for_status() resp.raise_for_status()
data = resp.json() data = resp.json()
except Exception as exc: # noqa: BLE001 — best-effort; never fatal except Exception as exc: # noqa: BLE001 — best-effort; never fatal
logger.warning( logger.warning(
"DO pricing fetch failed: %s; cost column will be blank.", "%s pricing fetch failed: %s; cost column will be blank.",
provider,
exc, exc,
) )
return cls() return cls()
prices = _parse_do_pricing(data) if provider == "models.dev":
prices = _parse_models_dev_pricing(data)
else:
prices = _parse_do_pricing(data)
if not prices: if not prices:
# Dump the first entry's raw shape so we can see which fields DO # Dump a sample of the raw shape so we can see which fields the
# actually returned — helps when the catalog adds new fields or # catalog returned — helps when it adds new fields or the response
# the response doesn't match our parser. # doesn't match our parser.
import json as _json import json as _json
sample_items = _coerce_items(data) if provider == "models.dev" and isinstance(data, dict):
sample = sample_items[0] if sample_items else data sample = next(iter(data.values()), data)
else:
sample_items = _coerce_items(data)
sample = sample_items[0] if sample_items else data
logger.warning( logger.warning(
"DO pricing response had no parseable entries; cost column " "%s pricing response had no parseable entries; cost column "
"will be blank. Sample entry: %s", "will be blank. Sample entry: %s",
provider,
_json.dumps(sample, default=str)[:400], _json.dumps(sample, default=str)[:400],
) )
return cls(prices) return cls(prices)
@ -278,6 +306,75 @@ def _parse_do_pricing(data: Any) -> dict[str, ModelPrice]:
return prices return prices
def _parse_models_dev_pricing(data: Any) -> dict[str, ModelPrice]:
    """Build a ModelPrice lookup table from a models.dev ``api.json`` payload.

    The payload is a top-level object keyed by provider id; each provider
    holds a ``models`` object keyed by model id::

        {
            "anthropic": {
                "models": {
                    "claude-opus-4-5": {
                        "cost": {"input": 5, "output": 25, "cache_read": 0.5}
                    }
                }
            },
            ...
        }

    ``cost.*`` figures are USD per *million* tokens and are converted to
    per-token rates here. Each priced model is stored under
    ``provider/model`` (the form Plano's routing names take). The lowercased
    composite, the bare model id and the lowercased bare id are registered as
    fallbacks, but only when those keys are not already present.

    Models whose ``input`` or ``output`` rate does not convert to a number,
    and models whose input and output rates are both zero, are left out.
    """
    per_million = 1_000_000
    catalog: dict[str, ModelPrice] = {}
    if not isinstance(data, dict):
        return catalog

    for vendor_id, vendor_entry in data.items():
        model_map = (
            vendor_entry.get("models") if isinstance(vendor_entry, dict) else None
        )
        if not isinstance(model_map, dict):
            continue

        for model_id, model_entry in model_map.items():
            rates = model_entry.get("cost") if isinstance(model_entry, dict) else None
            if not isinstance(rates, dict):
                continue

            rate_in = _as_float(rates.get("input"))
            rate_out = _as_float(rates.get("output"))
            if rate_in is None or rate_out is None:
                continue
            # Skip 0-rate entries so cost falls back to `—` rather than $0.0000.
            if rate_in == 0 and rate_out == 0:
                continue

            rate_cached = _as_float(rates.get("cache_read"))
            if rate_cached is None:
                cached_per_token = None
            else:
                cached_per_token = rate_cached / per_million

            entry = ModelPrice(
                input_per_token_usd=rate_in / per_million,
                output_per_token_usd=rate_out / per_million,
                cached_input_per_token_usd=cached_per_token,
            )

            # The exact composite key always takes the latest price; the
            # remaining spellings are fallbacks that never overwrite.
            routed_name = f"{vendor_id}/{model_id}"
            catalog[routed_name] = entry
            bare_name = str(model_id)
            for fallback in (routed_name.lower(), bare_name, bare_name.lower()):
                catalog.setdefault(fallback, entry)

    return catalog
def _as_float(value: Any) -> float | None:
if value is None:
return None
try:
return float(value)
except (TypeError, ValueError):
return None
def _coerce_items(data: Any) -> list[dict]: def _coerce_items(data: Any) -> list[dict]:
if isinstance(data, list): if isinstance(data, list):
return [x for x in data if isinstance(x, dict)] return [x for x in data if isinstance(x, dict)]

View file

@ -2,9 +2,12 @@
from __future__ import annotations from __future__ import annotations
import logging
import os
import time import time
import rich_click as click import rich_click as click
import yaml
from rich.console import Console from rich.console import Console
from rich.live import Live from rich.live import Live
@ -15,8 +18,50 @@ from planoai.obs.collector import (
LLMCallStore, LLMCallStore,
ObsCollector, ObsCollector,
) )
from planoai.obs.pricing import PricingCatalog from planoai.obs.pricing import DEFAULT_PRICING_PROVIDER, PricingCatalog
from planoai.obs.render import render from planoai.obs.render import render
from planoai.utils import find_config_file
logger = logging.getLogger(__name__)
def _resolve_pricing_source(
    config_file: str | None,
    provider_override: str | None,
    url_override: str | None,
) -> tuple[str, str | None]:
    """Pick the cost pricing source.

    Precedence: explicit CLI overrides > the first ``type: cost`` entry in
    ``model_metrics_sources`` from the Plano config > the DigitalOcean default.

    A URL read from the config belongs to the config's provider. When
    ``provider_override`` selects a *different* provider and no
    ``url_override`` is given, that URL is discarded so one provider's
    catalog is not parsed with the other provider's schema.

    Args:
        config_file: Optional path to the Plano config; passed to
            ``find_config_file`` to locate the file.
        provider_override: Provider name from the CLI, if any.
        url_override: Catalog URL from the CLI, if any.

    Returns:
        ``(provider, url)``; ``url`` is ``None`` when no explicit endpoint
        applies.
    """
    provider = DEFAULT_PRICING_PROVIDER
    url: str | None = None

    config_path = find_config_file(file=config_file)
    if config_path and os.path.exists(config_path):
        try:
            with open(config_path, "r", encoding="utf-8") as f:
                config = yaml.safe_load(f) or {}
            sources = config.get("model_metrics_sources") or []
            for source in sources:
                if isinstance(source, dict) and source.get("type") == "cost":
                    if source.get("provider"):
                        provider = str(source["provider"])
                    if source.get("url"):
                        url = str(source["url"])
                    # Only the first cost source is considered.
                    break
        except Exception as exc:  # noqa: BLE001 — config is optional for obs
            logger.warning(
                "could not read pricing source from %s: %s", config_path, exc
            )

    if provider_override:
        # The config URL targets the config's provider; it is meaningless
        # once the CLI switches to another provider.
        if provider_override.strip().lower() != provider.strip().lower():
            url = None
        provider = provider_override
    if url_override:
        url = url_override
    return provider, url
@click.command(name="obs", help="Live observability console for Plano LLM traffic.") @click.command(name="obs", help="Live observability console for Plano LLM traffic.")
@ -48,13 +93,42 @@ from planoai.obs.render import render
show_default=True, show_default=True,
help="TUI refresh interval.", help="TUI refresh interval.",
) )
def obs(port: int, host: str, capacity: int, refresh_ms: int) -> None: @click.option(
"--config",
"config_file",
type=str,
default=None,
help="Path to the Plano config to read the pricing source from "
"(defaults to ./config.yaml or ./plano_config.yaml).",
)
@click.option(
"--pricing-provider",
type=click.Choice(["digitalocean", "models.dev"]),
default=None,
help="Override the cost pricing provider (otherwise read from config).",
)
@click.option(
"--pricing-url",
type=str,
default=None,
help="Override the pricing catalog URL (otherwise read from config / provider default).",
)
def obs(
port: int,
host: str,
capacity: int,
refresh_ms: int,
config_file: str | None,
pricing_provider: str | None,
pricing_url: str | None,
) -> None:
console = Console() console = Console()
provider, url = _resolve_pricing_source(config_file, pricing_provider, pricing_url)
console.print( console.print(
f"[bold {PLANO_COLOR}]planoai obs[/] — loading DO pricing catalog...", f"[bold {PLANO_COLOR}]planoai obs[/] — loading {provider} pricing catalog...",
end="", end="",
) )
pricing = PricingCatalog.fetch() pricing = PricingCatalog.fetch(provider=provider, url=url)
if len(pricing): if len(pricing):
sample = ", ".join(pricing.sample_models(3)) sample = ", ".join(pricing.sample_models(3))
console.print( console.print(
@ -63,7 +137,7 @@ def obs(port: int, host: str, capacity: int, refresh_ms: int) -> None:
else: else:
console.print( console.print(
" [yellow]no pricing loaded[/] — " " [yellow]no pricing loaded[/] — "
"[dim]cost column will be blank (DO catalog unreachable)[/]" f"[dim]cost column will be blank ({provider} catalog unreachable)[/]"
) )
store = LLMCallStore(capacity=capacity) store = LLMCallStore(capacity=capacity)

View file

@ -144,3 +144,68 @@ def test_parse_do_catalog_divides_large_values_as_per_million():
prices = _parse_do_pricing(sample) prices = _parse_do_pricing(sample)
assert prices["mystery-model"].input_per_token_usd == 5.0 / 1_000_000 assert prices["mystery-model"].input_per_token_usd == 5.0 / 1_000_000
assert prices["mystery-model"].output_per_token_usd == 15.0 / 1_000_000 assert prices["mystery-model"].output_per_token_usd == 15.0 / 1_000_000
# Minimal models.dev-shaped payload shared by the models.dev parser tests:
# two providers, one model carrying a cache_read rate, one without it, and
# one model with no cost block at all.
_MODELS_DEV_SAMPLE = {
    "anthropic": {
        "id": "anthropic",
        "models": {
            "claude-opus-4-5": {
                "id": "claude-opus-4-5",
                "cost": {"input": 5, "output": 25, "cache_read": 0.5},
            }
        },
    },
    "groq": {
        "id": "groq",
        "models": {
            "llama-3.3-70b-versatile": {
                "id": "llama-3.3-70b-versatile",
                "cost": {"input": 0.59, "output": 0.79},
            },
            # No cost block → skipped.
            "whisper-large-v3-turbo": {"id": "whisper-large-v3-turbo"},
        },
    },
}
def test_parse_models_dev_composes_provider_keys_and_per_token_rates():
    """Parsed keys are ``provider/model`` and rates are per-token USD."""
    from planoai.obs.pricing import _parse_models_dev_pricing

    parsed = _parse_models_dev_pricing(_MODELS_DEV_SAMPLE)

    # models.dev cost values are per-million → divided by 1e6.
    opus_price = parsed["anthropic/claude-opus-4-5"]
    assert opus_price.input_per_token_usd == 5 / 1_000_000
    assert opus_price.output_per_token_usd == 25 / 1_000_000
    assert opus_price.cached_input_per_token_usd == 0.5 / 1_000_000

    # The composite key matches Plano's routing names; the bare model id is
    # registered as a fallback.
    for expected_key in ("groq/llama-3.3-70b-versatile", "llama-3.3-70b-versatile"):
        assert expected_key in parsed

    # Models without a cost block are skipped.
    assert "groq/whisper-large-v3-turbo" not in parsed
def test_models_dev_catalog_cost_computation():
    """A catalog built from models.dev prices yields the expected call cost."""
    from planoai.obs.pricing import PricingCatalog, _parse_models_dev_pricing

    price_map = _parse_models_dev_pricing(_MODELS_DEV_SAMPLE)
    catalog = PricingCatalog(price_map)
    call = _call("anthropic/claude-opus-4-5", 1000, 500)

    # 1000 input @ 5e-6 = 0.005; 500 output @ 25e-6 = 0.0125
    expected = round(0.005 + 0.0125, 6)
    assert catalog.cost_for_call(call) == expected
def test_models_dev_skips_zero_rate_entries():
    """Models priced at zero for both input and output are not registered."""
    from planoai.obs.pricing import _parse_models_dev_pricing

    zero_cost = {"input": 0, "output": 0}
    payload = {"free": {"models": {"promo-model": {"cost": zero_cost}}}}

    assert _parse_models_dev_pricing(payload) == {}

View file

@ -582,13 +582,17 @@ properties:
type: string type: string
enum: enum:
- digitalocean - digitalocean
- models.dev
url:
type: string
description: "Optional override for the pricing catalog endpoint. Defaults per provider (digitalocean: DO GenAI catalog; models.dev: https://models.dev/api.json)."
refresh_interval: refresh_interval:
type: integer type: integer
minimum: 1 minimum: 1
description: "Refresh interval in seconds" description: "Refresh interval in seconds"
model_aliases: model_aliases:
type: object type: object
description: "Map DO catalog keys (lowercase(creator)/model_id) to Plano model names used in routing_preferences. Example: 'openai/openai-gpt-oss-120b: openai/gpt-4o'" description: "Map catalog keys to Plano model names used in routing_preferences. DigitalOcean keys are 'lowercase(creator)/model_id'; models.dev keys are 'creator/model_id'. Example: 'openai/openai-gpt-oss-120b: openai/gpt-4o'"
additionalProperties: additionalProperties:
type: string type: string
required: required:

View file

@ -9,6 +9,7 @@ use tokio::sync::RwLock;
use tracing::{debug, info, warn}; use tracing::{debug, info, warn};
const DO_PRICING_URL: &str = "https://api.digitalocean.com/v2/gen-ai/models/catalog"; const DO_PRICING_URL: &str = "https://api.digitalocean.com/v2/gen-ai/models/catalog";
const MODELS_DEV_URL: &str = "https://models.dev/api.json";
pub struct ModelMetricsService { pub struct ModelMetricsService {
cost: Arc<RwLock<HashMap<String, f64>>>, cost: Arc<RwLock<HashMap<String, f64>>>,
@ -22,28 +23,35 @@ impl ModelMetricsService {
for source in sources { for source in sources {
match source { match source {
MetricsSource::Cost(cfg) => match cfg.provider { MetricsSource::Cost(cfg) => {
CostProvider::Digitalocean => { let provider = cfg.provider.clone();
let aliases = cfg.model_aliases.clone().unwrap_or_default(); let url = cfg
let data = fetch_do_pricing(&client, &aliases).await; .url
info!(models = data.len(), "fetched digitalocean pricing"); .clone()
*cost_data.write().await = data; .unwrap_or_else(|| default_cost_url(&provider).to_string());
let aliases = cfg.model_aliases.clone().unwrap_or_default();
let provider_name = cost_provider_name(&provider);
if let Some(interval_secs) = cfg.refresh_interval { let data = fetch_cost_pricing(&provider, &url, &client, &aliases).await;
let cost_clone = Arc::clone(&cost_data); info!(models = data.len(), provider = provider_name, url = %url, "fetched cost pricing");
let client_clone = client.clone(); *cost_data.write().await = data;
let interval = Duration::from_secs(interval_secs);
tokio::spawn(async move { if let Some(interval_secs) = cfg.refresh_interval {
loop { let cost_clone = Arc::clone(&cost_data);
tokio::time::sleep(interval).await; let client_clone = client.clone();
let data = fetch_do_pricing(&client_clone, &aliases).await; let interval = Duration::from_secs(interval_secs);
info!(models = data.len(), "refreshed digitalocean pricing"); tokio::spawn(async move {
*cost_clone.write().await = data; loop {
} tokio::time::sleep(interval).await;
}); let data =
} fetch_cost_pricing(&provider, &url, &client_clone, &aliases)
.await;
info!(models = data.len(), provider = provider_name, url = %url, "refreshed cost pricing");
*cost_clone.write().await = data;
}
});
} }
}, }
MetricsSource::Latency(cfg) => match cfg.provider { MetricsSource::Latency(cfg) => match cfg.provider {
LatencyProvider::Prometheus => { LatencyProvider::Prometheus => {
let data = fetch_prometheus_metrics(&cfg.url, &cfg.query, &client).await; let data = fetch_prometheus_metrics(&cfg.url, &cfg.query, &client).await;
@ -165,11 +173,55 @@ struct DoPricing {
output_price_per_million: Option<f64>, output_price_per_million: Option<f64>,
} }
async fn fetch_do_pricing( #[derive(serde::Deserialize)]
/// One provider entry of the models.dev catalog (a value of the top-level
/// object, which is keyed by provider id).
struct ModelsDevProvider {
    /// Models offered by this provider, keyed by model id. A provider entry
    /// without a `models` field deserializes to an empty map.
    #[serde(default)]
    models: HashMap<String, ModelsDevModel>,
}
/// One model entry inside a models.dev provider's `models` map.
#[derive(serde::Deserialize)]
struct ModelsDevModel {
    /// Pricing block; `None` when the catalog lists no cost for the model.
    cost: Option<ModelsDevCost>,
}
/// The `cost` block of a models.dev model (per-million USD rates).
#[derive(serde::Deserialize)]
struct ModelsDevCost {
    /// Input-token rate, if published.
    input: Option<f64>,
    /// Output-token rate, if published.
    output: Option<f64>,
}
/// Return the built-in catalog endpoint for `provider`, used when the cost
/// source config does not set an explicit `url`.
fn default_cost_url(provider: &CostProvider) -> &'static str {
    match provider {
        CostProvider::Digitalocean => DO_PRICING_URL,
        CostProvider::ModelsDev => MODELS_DEV_URL,
    }
}
/// Return a human-readable name for `provider`, used as the `provider` field
/// of the pricing log events.
fn cost_provider_name(provider: &CostProvider) -> &'static str {
    match provider {
        CostProvider::Digitalocean => "digitalocean",
        CostProvider::ModelsDev => "models.dev",
    }
}
async fn fetch_cost_pricing(
provider: &CostProvider,
url: &str,
client: &reqwest::Client, client: &reqwest::Client,
aliases: &HashMap<String, String>, aliases: &HashMap<String, String>,
) -> HashMap<String, f64> { ) -> HashMap<String, f64> {
match client.get(DO_PRICING_URL).send().await { match provider {
CostProvider::Digitalocean => fetch_do_pricing(url, client, aliases).await,
CostProvider::ModelsDev => fetch_models_dev_pricing(url, client, aliases).await,
}
}
async fn fetch_do_pricing(
url: &str,
client: &reqwest::Client,
aliases: &HashMap<String, String>,
) -> HashMap<String, f64> {
match client.get(url).send().await {
Ok(resp) => match resp.json::<DoModelList>().await { Ok(resp) => match resp.json::<DoModelList>().await {
Ok(list) => list Ok(list) => list
.data .data
@ -184,17 +236,66 @@ async fn fetch_do_pricing(
}) })
.collect(), .collect(),
Err(err) => { Err(err) => {
warn!(error = %err, url = DO_PRICING_URL, "failed to parse digitalocean pricing response"); warn!(error = %err, url = %url, "failed to parse digitalocean pricing response");
HashMap::new() HashMap::new()
} }
}, },
Err(err) => { Err(err) => {
warn!(error = %err, url = DO_PRICING_URL, "failed to fetch digitalocean pricing"); warn!(error = %err, url = %url, "failed to fetch digitalocean pricing");
HashMap::new() HashMap::new()
} }
} }
} }
/// Fetch the models.dev catalog from `url` and convert it into a cost map.
///
/// models.dev publishes a top-level object keyed by provider id; each provider
/// carries a `models` map keyed by model id, and each model's `cost` block
/// holds per-million USD rates. Key composition, aliasing and the
/// input + output sum (mirroring the DO ranking metric) are handled by
/// `parse_models_dev_pricing`.
///
/// Every failure — transport error, non-success HTTP status, or a body that
/// does not match the expected shape — is logged as a warning and yields an
/// empty map; this function never returns an error.
async fn fetch_models_dev_pricing(
    url: &str,
    client: &reqwest::Client,
    aliases: &HashMap<String, String>,
) -> HashMap<String, f64> {
    // Treat 4xx/5xx as a fetch failure instead of attempting to parse an
    // error page and reporting a misleading "failed to parse".
    let resp = match client
        .get(url)
        .send()
        .await
        .and_then(reqwest::Response::error_for_status)
    {
        Ok(resp) => resp,
        Err(err) => {
            warn!(error = %err, url = %url, "failed to fetch models.dev pricing");
            return HashMap::new();
        }
    };

    match resp.json::<HashMap<String, ModelsDevProvider>>().await {
        Ok(providers) => parse_models_dev_pricing(providers, aliases),
        Err(err) => {
            warn!(error = %err, url = %url, "failed to parse models.dev pricing response");
            HashMap::new()
        }
    }
}
/// Convert a deserialized models.dev catalog into a `model name -> cost` map.
///
/// The cost is `input + output`; models missing a `cost` block or either rate
/// are skipped. Each priced model contributes:
///
/// * its composite key `provider_id/model_key` — first-party providers use
///   bare model keys (`claude-opus-4-5`), so composing lines the key up with
///   Plano routing names — or, when `aliases` contains that composite key,
///   the alias target instead;
/// * its bare `model_key` as a fallback lookup.
///
/// `HashMap` iteration order is unspecified, so entries are first sorted by
/// `(provider_id, model_key)` and key collisions are resolved with a fixed
/// priority, making the result identical across runs:
/// alias targets override catalog composite keys, composite keys override
/// bare fallbacks, and among bare fallbacks the first in sorted order wins.
fn parse_models_dev_pricing(
    providers: HashMap<String, ModelsDevProvider>,
    aliases: &HashMap<String, String>,
) -> HashMap<String, f64> {
    // Flatten to (provider, model, total) so the entries can be ordered.
    let mut priced: Vec<(String, String, f64)> = Vec::new();
    for (provider_id, provider) in providers {
        for (model_key, model) in provider.models {
            let Some(cost) = model.cost else { continue };
            let (Some(input), Some(output)) = (cost.input, cost.output) else {
                continue;
            };
            priced.push((provider_id.clone(), model_key, input + output));
        }
    }
    priced.sort_by(|a, b| a.0.cmp(&b.0).then_with(|| a.1.cmp(&b.1)));

    let mut out = HashMap::new();
    let mut aliased: Vec<(String, f64)> = Vec::new();
    let mut bare: Vec<(String, f64)> = Vec::new();
    for (provider_id, model_key, total) in priced {
        let raw_key = format!("{provider_id}/{model_key}");
        match aliases.get(&raw_key) {
            Some(target) => aliased.push((target.clone(), total)),
            None => {
                out.insert(raw_key, total);
            }
        }
        bare.push((model_key, total));
    }

    // An alias is an explicit user mapping, so it wins over a catalog entry
    // that happens to share the target name.
    for (key, total) in aliased {
        out.insert(key, total);
    }
    // Bare ids are only a fallback and never replace an existing key.
    for (key, total) in bare {
        out.entry(key).or_insert(total);
    }
    out
}
#[derive(serde::Deserialize)] #[derive(serde::Deserialize)]
struct PrometheusResponse { struct PrometheusResponse {
data: PrometheusData, data: PrometheusData,
@ -368,6 +469,50 @@ mod tests {
assert_eq!(result, vec!["gpt-4o", "gpt-4o-mini"]); assert_eq!(result, vec!["gpt-4o", "gpt-4o-mini"]);
} }
#[test]
fn test_parse_models_dev_pricing_composes_provider_keys() {
let json = r#"{
"anthropic": {
"models": {
"claude-opus-4-5": {"cost": {"input": 5.0, "output": 25.0}}
}
},
"groq": {
"models": {
"llama-3.3-70b-versatile": {"cost": {"input": 0.59, "output": 0.79}},
"whisper-large-v3-turbo": {"cost": null}
}
}
}"#;
let providers: HashMap<String, ModelsDevProvider> = serde_json::from_str(json).unwrap();
let aliases = HashMap::new();
let prices = parse_models_dev_pricing(providers, &aliases);
assert_eq!(prices.get("anthropic/claude-opus-4-5"), Some(&30.0));
assert_eq!(prices.get("groq/llama-3.3-70b-versatile"), Some(&1.38));
// bare fallback also registered
assert_eq!(prices.get("claude-opus-4-5"), Some(&30.0));
// models with no cost block are skipped
assert!(!prices.contains_key("groq/whisper-large-v3-turbo"));
}
#[test]
fn test_parse_models_dev_pricing_applies_aliases() {
let json = r#"{
"openai": {"models": {"gpt-oss-120b": {"cost": {"input": 1.0, "output": 2.0}}}}
}"#;
let providers: HashMap<String, ModelsDevProvider> = serde_json::from_str(json).unwrap();
let mut aliases = HashMap::new();
aliases.insert(
"openai/gpt-oss-120b".to_string(),
"openai/gpt-4o".to_string(),
);
let prices = parse_models_dev_pricing(providers, &aliases);
assert_eq!(prices.get("openai/gpt-4o"), Some(&3.0));
assert!(!prices.contains_key("openai/gpt-oss-120b"));
}
#[test] #[test]
fn test_rank_by_ascending_metric_nan_treated_as_missing() { fn test_rank_by_ascending_metric_nan_treated_as_missing() {
let models = vec![ let models = vec![

View file

@ -177,8 +177,13 @@ pub enum MetricsSource {
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CostMetricsConfig { pub struct CostMetricsConfig {
pub provider: CostProvider, pub provider: CostProvider,
/// Optional override for the pricing catalog endpoint. When omitted, a
/// sensible default is used per provider.
pub url: Option<String>,
pub refresh_interval: Option<u64>, pub refresh_interval: Option<u64>,
/// Map DO catalog keys (`lowercase(creator)/model_id`) to Plano model names. /// Map catalog keys to Plano model names used in `routing_preferences`.
/// DigitalOcean keys look like `lowercase(creator)/model_id`; models.dev
/// keys look like `creator/model_id`.
/// Example: `openai/openai-gpt-oss-120b: openai/gpt-4o` /// Example: `openai/openai-gpt-oss-120b: openai/gpt-4o`
pub model_aliases: Option<HashMap<String, String>>, pub model_aliases: Option<HashMap<String, String>>,
} }
@ -187,6 +192,8 @@ pub struct CostMetricsConfig {
#[serde(rename_all = "snake_case")] #[serde(rename_all = "snake_case")]
pub enum CostProvider { pub enum CostProvider {
Digitalocean, Digitalocean,
#[serde(rename = "models.dev")]
ModelsDev,
} }
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
@ -741,6 +748,51 @@ mod test {
} }
} }
#[test]
fn test_deserialize_models_dev_cost_source() {
let yaml = r#"
- type: cost
provider: models.dev
url: https://models.dev/api.json
refresh_interval: 3600
model_aliases:
openai/gpt-oss-120b: openai/gpt-4o
"#;
let sources: Vec<super::MetricsSource> = serde_yaml::from_str(yaml).unwrap();
assert_eq!(sources.len(), 1);
match &sources[0] {
super::MetricsSource::Cost(cfg) => {
assert!(matches!(cfg.provider, super::CostProvider::ModelsDev));
assert_eq!(cfg.url.as_deref(), Some("https://models.dev/api.json"));
assert_eq!(cfg.refresh_interval, Some(3600));
assert_eq!(
cfg.model_aliases
.as_ref()
.and_then(|m| m.get("openai/gpt-oss-120b"))
.map(String::as_str),
Some("openai/gpt-4o")
);
}
other => panic!("expected cost source, got {other:?}"),
}
}
#[test]
fn test_deserialize_digitalocean_cost_source_without_url() {
let yaml = r#"
- type: cost
provider: digitalocean
"#;
let sources: Vec<super::MetricsSource> = serde_yaml::from_str(yaml).unwrap();
match &sources[0] {
super::MetricsSource::Cost(cfg) => {
assert!(matches!(cfg.provider, super::CostProvider::Digitalocean));
assert_eq!(cfg.url, None);
}
other => panic!("expected cost source, got {other:?}"),
}
}
#[test] #[test]
fn test_into_models_filters_internal_providers() { fn test_into_models_filters_internal_providers() {
let providers = vec![ let providers = vec![

View file

@ -209,6 +209,178 @@ Clients can let the router decide or still specify aliases:
) )
.. _cost_latency_aware_selection:
Cost- and latency-aware selection
---------------------------------
When a route lists more than one candidate model, you can let Plano reorder that
candidate pool using **live cost or latency data** instead of relying solely on the
order you wrote them in. This is controlled per route with ``selection_policy`` and
backed by one or more ``model_metrics_sources``.
This is useful when several models are equally capable for a route and you want Plano
to always reach for the cheapest (or fastest) option first, with the others kept as
fallbacks.
Selection policy
~~~~~~~~~~~~~~~~~
Attach an optional ``selection_policy`` to any entry in ``routing_preferences``:
.. code-block:: yaml
:caption: Per-route selection policy
routing_preferences:
- name: code review
description: reviewing, analyzing, and suggesting improvements to existing code
models:
- anthropic/claude-sonnet-4-5
- groq/llama-3.3-70b-versatile
selection_policy:
prefer: cheapest # cheapest | fastest | none
``prefer`` accepts:
- ``cheapest`` — order candidates by total price (input + output rate) ascending, using a ``cost`` metrics source.
- ``fastest`` — order candidates by observed latency ascending, using a ``latency`` metrics source.
- ``none`` (default) — keep the order you declared; no reordering.
Models that have no data in the selected source are ranked **last**, in their original
order, so routing always degrades gracefully rather than dropping a candidate.
Configuring the pricing source
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
``cheapest`` routing needs a price catalog. Plano's **default pricing provider is
DigitalOcean** — its GenAI model catalog is public (no API key, no signup), so cost data
is available out of the box and is what ``planoai obs`` uses if you don't configure
anything. The pricing source is fully swappable: point Plano at `models.dev <https://models.dev/>`_,
or at **any endpoint that exposes a supported pricing structure**.
The ``provider`` field selects which response schema Plano expects (and therefore how it
parses the catalog); the optional ``url`` lets you override the endpoint — for example to
use a mirror, a cached copy, or an internal catalog service that returns the same shape.
.. list-table::
:header-rows: 1
:widths: 18 34 28 20
* - ``provider``
- Default catalog URL
- Key format
- Expected structure
* - ``digitalocean`` *(default)*
- DigitalOcean GenAI model catalog
- ``lowercase(creator)/model_id``
- ``{ data: [ { model_id, pricing: { input_price_per_million, output_price_per_million } } ] }``
* - ``models.dev``
- ``https://models.dev/api.json``
- ``creator/model`` (e.g. ``anthropic/claude-sonnet-4-5``)
- ``{ <provider>: { models: { <model>: { cost: { input, output } } } } }``
Because the source is selected per ``provider``, switching is a one-line change. To stay
on the default DigitalOcean catalog you can omit ``model_metrics_sources`` entirely for
``planoai obs``, or declare it explicitly for routing:
.. code-block:: yaml
:caption: Default cost source (DigitalOcean)
model_metrics_sources:
- type: cost
provider: digitalocean # default; uses the public DO GenAI catalog
To switch to models.dev — an open, community-maintained catalog covering a broad range of
providers and models — change the ``provider`` (and optionally ``url``):
.. code-block:: yaml
:caption: Cost source backed by models.dev
model_metrics_sources:
- type: cost
provider: models.dev # models.dev | digitalocean
url: https://models.dev/api.json # optional; defaults per provider
refresh_interval: 3600 # optional, seconds; refetch on this interval
model_aliases: # optional; see below
openai/gpt-oss-120b: openai/gpt-4o
To use your own endpoint, pick the ``provider`` whose structure your endpoint matches and
override ``url`` — Plano parses the response with that provider's schema:
.. code-block:: yaml
:caption: Custom endpoint exposing the DigitalOcean catalog structure
model_metrics_sources:
- type: cost
provider: digitalocean # selects the DO response schema
url: https://catalog.internal.example.com/pricing
.. note::
The cost metric used for ranking is the sum of the input and output per-million-token
rates — a relative signal for ordering candidates, not a per-request bill. For actual
per-request cost, see the observability console below.
Matching catalog keys to your models
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The router looks up each candidate model by the exact name you use in
``routing_preferences`` (e.g. ``anthropic/claude-sonnet-4-5``). For models.dev, Plano builds
each catalog key as ``<provider id>/<model key>``, which lines up with Plano's
``provider/model`` naming, so most models match automatically. The bare model key is also
registered as a fallback.
When a catalog key does not match your model name — for example a version skew, or an
open-weight model you serve under a different provider — use ``model_aliases`` to map the
**catalog key** to the **Plano model name** used in your routing preferences:
.. code-block:: yaml
model_metrics_sources:
- type: cost
provider: models.dev
model_aliases:
# catalog key : plano model name
openai/gpt-oss-120b: openai/gpt-4o
Latency source
~~~~~~~~~~~~~~~
``fastest`` routing reads observed latency from a Prometheus instance. Provide the query
that returns a per-model latency value (lower is faster), labelled by ``model_name``:
.. code-block:: yaml
:caption: Latency source backed by Prometheus
model_metrics_sources:
- type: latency
provider: prometheus
url: http://prometheus:9090
query: avg by (model_name) (rate(plano_llm_latency_seconds_sum[5m]))
refresh_interval: 60
You can declare both a ``cost`` and a ``latency`` source at the same time; each route
picks whichever it needs based on its ``selection_policy``.
Cost in the observability console
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
``planoai obs`` displays a per-request USD cost column derived from the same pricing
catalog. By default it reads the ``cost`` source from your config (the first
``type: cost`` entry under ``model_metrics_sources``); you can also override it on the
command line:
.. code-block:: bash
# Use the cost source from ./config.yaml (default)
planoai obs
# Or override the provider / endpoint explicitly
planoai obs --pricing-provider models.dev
planoai obs --pricing-provider models.dev --pricing-url https://models.dev/api.json
If no source is configured and no override is given, ``planoai obs`` falls back to the
DigitalOcean catalog so the cost column still populates out of the box.
Plano-Orchestrator Plano-Orchestrator
------------------- -------------------
Plano-Orchestrator is a **preference-based routing model** specifically designed to address the limitations of traditional LLM routing. It delivers production-ready performance with low latency and high accuracy while solving key routing challenges. Plano-Orchestrator is a **preference-based routing model** specifically designed to address the limitations of traditional LLM routing. It delivers production-ready performance with low latency and high accuracy while solving key routing challenges.

View file

@ -86,6 +86,24 @@ routing_preferences:
selection_policy: selection_policy:
prefer: cheapest prefer: cheapest
# model_metrics_sources: external catalogs the router reads to reorder candidate
# models for selection_policy.prefer. A `cost` source ranks `prefer: cheapest`;
# a `latency` source ranks `prefer: fastest`. Both are optional.
model_metrics_sources:
# Cost catalog. provider: models.dev | digitalocean (default url per provider).
- type: cost
provider: models.dev
url: https://models.dev/api.json # optional; omit to use the provider default
refresh_interval: 3600 # optional, seconds
model_aliases: # optional: catalog key -> Plano model name
openai/gpt-oss-120b: openai/gpt-4o
# Latency catalog (Prometheus). Used for selection_policy.prefer: fastest.
- type: latency
provider: prometheus
url: http://prometheus:9090
query: avg by (model_name) (rate(plano_llm_latency_seconds_sum[5m]))
refresh_interval: 60
# HTTP listeners - entry points for agent routing, prompt targets, and direct LLM access # HTTP listeners - entry points for agent routing, prompt targets, and direct LLM access
listeners: listeners:
# Agent listener for routing requests to multiple agents # Agent listener for routing requests to multiple agents

View file

@ -115,6 +115,18 @@ model_aliases:
target: gpt-4o-mini target: gpt-4o-mini
smart-llm: smart-llm:
target: gpt-4o target: gpt-4o
model_metrics_sources:
- model_aliases:
openai/gpt-oss-120b: openai/gpt-4o
provider: models.dev
refresh_interval: 3600
type: cost
url: https://models.dev/api.json
- provider: prometheus
query: avg by (model_name) (rate(plano_llm_latency_seconds_sum[5m]))
refresh_interval: 60
type: latency
url: http://prometheus:9090
model_providers: model_providers:
- access_key: $OPENAI_API_KEY - access_key: $OPENAI_API_KEY
default: true default: true