feat: make model pricing source configurable (models.dev + DigitalOcean) (#971)

This commit is contained in:
Musa 2026-06-24 10:14:12 -07:00 committed by GitHub
parent 5cc4c4ee77
commit 558df0307c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 687 additions and 48 deletions

View file

@ -1,7 +1,8 @@
"""DigitalOcean Gradient pricing catalog for the obs console.
"""Model pricing catalog for the obs console.
Ported loosely from ``crates/brightstaff/src/router/model_metrics.rs::fetch_do_pricing``.
Single-source: one fetch at startup, cached for the life of the process.
Mirrors ``crates/brightstaff/src/router/model_metrics.rs``. The source is
configurable: ``digitalocean`` (DO GenAI catalog) or ``models.dev``. A single
fetch at startup is cached for the life of the process.
"""
from __future__ import annotations
@ -14,7 +15,18 @@ from typing import Any
import requests
DEFAULT_PRICING_URL = "https://api.digitalocean.com/v2/gen-ai/models/catalog"
DO_PRICING_URL = "https://api.digitalocean.com/v2/gen-ai/models/catalog"
MODELS_DEV_URL = "https://models.dev/api.json"
# Backwards-compatible default (DigitalOcean) used when no provider is given.
DEFAULT_PRICING_URL = DO_PRICING_URL
DEFAULT_PRICING_PROVIDER = "digitalocean"
_DEFAULT_URLS = {
"digitalocean": DO_PRICING_URL,
"models.dev": MODELS_DEV_URL,
}
FETCH_TIMEOUT_SECS = 5.0
@ -51,36 +63,52 @@ class PricingCatalog:
return list(self._prices.keys())[:n]
@classmethod
def fetch(
    cls,
    provider: str = DEFAULT_PRICING_PROVIDER,
    url: str | None = None,
) -> "PricingCatalog":
    """Fetch pricing from the configured catalog.

    On failure, returns an empty catalog (cost column will be blank).

    ``provider`` selects the parser/default URL: ``digitalocean`` or
    ``models.dev``. Both catalog endpoints are public — no auth required —
    so ``planoai obs`` gets cost data on first run out of the box.

    Args:
        provider: Catalog schema to parse. Compared case-insensitively;
            anything other than ``models.dev`` is parsed with the
            DigitalOcean parser.
        url: Optional endpoint override. When omitted, the provider's
            default URL is used.

    Returns:
        A catalog built from the parsed prices, or an empty catalog when
        the request fails or the response cannot be decoded.
    """
    provider = (provider or DEFAULT_PRICING_PROVIDER).strip()
    # Before ``provider`` existed the first positional argument was the
    # URL. Keep ``fetch("https://…")`` working instead of treating the URL
    # as an unknown provider name and silently fetching the default.
    if url is None and provider.lower().startswith(("http://", "https://")):
        url, provider = provider, DEFAULT_PRICING_PROVIDER
    provider = provider.lower()
    resolved_url = url or _DEFAULT_URLS.get(provider, DO_PRICING_URL)

    try:
        resp = requests.get(resolved_url, timeout=FETCH_TIMEOUT_SECS)
        resp.raise_for_status()
        data = resp.json()
    except Exception as exc:  # noqa: BLE001 — best-effort; never fatal
        logger.warning(
            "%s pricing fetch failed: %s; cost column will be blank.",
            provider,
            exc,
        )
        return cls()

    if provider == "models.dev":
        prices = _parse_models_dev_pricing(data)
    else:
        prices = _parse_do_pricing(data)

    if not prices:
        # Dump a sample of the raw shape so we can see which fields the
        # catalog returned — helps when it adds new fields or the response
        # doesn't match our parser.
        import json as _json

        if provider == "models.dev" and isinstance(data, dict):
            # models.dev is an object keyed by provider id; show one entry.
            sample = next(iter(data.values()), data)
        else:
            sample_items = _coerce_items(data)
            sample = sample_items[0] if sample_items else data
        logger.warning(
            "%s pricing response had no parseable entries; cost column "
            "will be blank. Sample entry: %s",
            provider,
            _json.dumps(sample, default=str)[:400],
        )
    return cls(prices)
@ -278,6 +306,75 @@ def _parse_do_pricing(data: Any) -> dict[str, ModelPrice]:
return prices
def _parse_models_dev_pricing(data: Any) -> dict[str, ModelPrice]:
"""Parse a models.dev ``api.json`` response into a ModelPrice map.
models.dev shape (top-level object keyed by provider id)::
{
"anthropic": {
"models": {
"claude-opus-4-5": {
"cost": {"input": 5, "output": 25, "cache_read": 0.5}
}
}
},
...
}
``cost.*`` values are USD per *million* tokens, so we divide by 1e6 to get a
per-token rate. First-party providers use bare model keys, so we register
both ``provider/model`` (matching Plano's routing names) and the bare model
id as a fallback.
"""
prices: dict[str, ModelPrice] = {}
if not isinstance(data, dict):
return prices
for provider_id, provider in data.items():
if not isinstance(provider, dict):
continue
models = provider.get("models")
if not isinstance(models, dict):
continue
for model_key, model in models.items():
if not isinstance(model, dict):
continue
cost = model.get("cost")
if not isinstance(cost, dict):
continue
input_pm = _as_float(cost.get("input"))
output_pm = _as_float(cost.get("output"))
if input_pm is None or output_pm is None:
continue
# Skip 0-rate entries so cost falls back to `—` rather than $0.0000.
if input_pm == 0 and output_pm == 0:
continue
cached_pm = _as_float(cost.get("cache_read"))
price = ModelPrice(
input_per_token_usd=input_pm / 1_000_000,
output_per_token_usd=output_pm / 1_000_000,
cached_input_per_token_usd=(
cached_pm / 1_000_000 if cached_pm is not None else None
),
)
composite = f"{provider_id}/{model_key}"
prices[composite] = price
prices.setdefault(composite.lower(), price)
prices.setdefault(str(model_key), price)
prices.setdefault(str(model_key).lower(), price)
return prices
def _as_float(value: Any) -> float | None:
if value is None:
return None
try:
return float(value)
except (TypeError, ValueError):
return None
def _coerce_items(data: Any) -> list[dict]:
if isinstance(data, list):
return [x for x in data if isinstance(x, dict)]

View file

@ -2,9 +2,12 @@
from __future__ import annotations
import logging
import os
import time
import rich_click as click
import yaml
from rich.console import Console
from rich.live import Live
@ -15,8 +18,50 @@ from planoai.obs.collector import (
LLMCallStore,
ObsCollector,
)
from planoai.obs.pricing import PricingCatalog
from planoai.obs.pricing import DEFAULT_PRICING_PROVIDER, PricingCatalog
from planoai.obs.render import render
from planoai.utils import find_config_file
logger = logging.getLogger(__name__)
def _resolve_pricing_source(
    config_file: str | None,
    provider_override: str | None,
    url_override: str | None,
) -> tuple[str, str | None]:
    """Pick the cost pricing source.

    Precedence: explicit CLI overrides > the first ``type: cost`` entry in
    ``model_metrics_sources`` from the Plano config > the DigitalOcean default.

    Args:
        config_file: Optional path passed to ``find_config_file``.
        provider_override: Provider given on the command line, if any.
        url_override: Catalog URL given on the command line, if any.

    Returns:
        ``(provider, url)``. ``url`` is ``None`` when neither the config nor
        the command line supplied one, leaving the choice of endpoint to the
        caller.
    """
    provider = DEFAULT_PRICING_PROVIDER
    url: str | None = None

    config_path = find_config_file(file=config_file)
    if config_path and os.path.exists(config_path):
        try:
            with open(config_path, "r", encoding="utf-8") as f:
                config = yaml.safe_load(f) or {}
            sources = config.get("model_metrics_sources") or []
            for source in sources:
                if isinstance(source, dict) and source.get("type") == "cost":
                    if source.get("provider"):
                        provider = str(source["provider"])
                    if source.get("url"):
                        url = str(source["url"])
                    # Only the first cost source is consulted.
                    break
        except Exception as exc:  # noqa: BLE001 — config is optional for obs
            logger.warning(
                "could not read pricing source from %s: %s", config_path, exc
            )

    if provider_override:
        # A URL read from the config belongs to the config's provider. If the
        # command line switches to a different provider, drop that URL so the
        # new provider's schema is not applied to the old provider's endpoint.
        if provider_override.strip().lower() != provider.strip().lower():
            url = None
        provider = provider_override
    if url_override:
        url = url_override
    return provider, url
@click.command(name="obs", help="Live observability console for Plano LLM traffic.")
@ -48,13 +93,42 @@ from planoai.obs.render import render
show_default=True,
help="TUI refresh interval.",
)
def obs(port: int, host: str, capacity: int, refresh_ms: int) -> None:
@click.option(
"--config",
"config_file",
type=str,
default=None,
help="Path to the Plano config to read the pricing source from "
"(defaults to ./config.yaml or ./plano_config.yaml).",
)
@click.option(
"--pricing-provider",
type=click.Choice(["digitalocean", "models.dev"]),
default=None,
help="Override the cost pricing provider (otherwise read from config).",
)
@click.option(
"--pricing-url",
type=str,
default=None,
help="Override the pricing catalog URL (otherwise read from config / provider default).",
)
def obs(
port: int,
host: str,
capacity: int,
refresh_ms: int,
config_file: str | None,
pricing_provider: str | None,
pricing_url: str | None,
) -> None:
console = Console()
provider, url = _resolve_pricing_source(config_file, pricing_provider, pricing_url)
console.print(
f"[bold {PLANO_COLOR}]planoai obs[/] — loading DO pricing catalog...",
f"[bold {PLANO_COLOR}]planoai obs[/] — loading {provider} pricing catalog...",
end="",
)
pricing = PricingCatalog.fetch()
pricing = PricingCatalog.fetch(provider=provider, url=url)
if len(pricing):
sample = ", ".join(pricing.sample_models(3))
console.print(
@ -63,7 +137,7 @@ def obs(port: int, host: str, capacity: int, refresh_ms: int) -> None:
else:
console.print(
" [yellow]no pricing loaded[/] — "
"[dim]cost column will be blank (DO catalog unreachable)[/]"
f"[dim]cost column will be blank ({provider} catalog unreachable)[/]"
)
store = LLMCallStore(capacity=capacity)

View file

@ -144,3 +144,68 @@ def test_parse_do_catalog_divides_large_values_as_per_million():
prices = _parse_do_pricing(sample)
assert prices["mystery-model"].input_per_token_usd == 5.0 / 1_000_000
assert prices["mystery-model"].output_per_token_usd == 15.0 / 1_000_000
_MODELS_DEV_SAMPLE = {
"anthropic": {
"id": "anthropic",
"models": {
"claude-opus-4-5": {
"id": "claude-opus-4-5",
"cost": {"input": 5, "output": 25, "cache_read": 0.5},
}
},
},
"groq": {
"id": "groq",
"models": {
"llama-3.3-70b-versatile": {
"id": "llama-3.3-70b-versatile",
"cost": {"input": 0.59, "output": 0.79},
},
# No cost block → skipped.
"whisper-large-v3-turbo": {"id": "whisper-large-v3-turbo"},
},
},
}
def test_parse_models_dev_composes_provider_keys_and_per_token_rates():
    """Parsed keys are ``provider/model`` and rates are converted to per-token."""
    from planoai.obs.pricing import _parse_models_dev_pricing

    parsed = _parse_models_dev_pricing(_MODELS_DEV_SAMPLE)
    per_million = 1_000_000

    # models.dev publishes per-million rates; the parser divides by 1e6.
    claude = parsed["anthropic/claude-opus-4-5"]
    assert claude.input_per_token_usd == 5 / per_million
    assert claude.output_per_token_usd == 25 / per_million
    assert claude.cached_input_per_token_usd == 0.5 / per_million

    # Both the composite routing name and the bare model id resolve.
    for expected_key in ("groq/llama-3.3-70b-versatile", "llama-3.3-70b-versatile"):
        assert expected_key in parsed

    # An entry with no cost block never reaches the catalog.
    assert "groq/whisper-large-v3-turbo" not in parsed
def test_models_dev_catalog_cost_computation():
    """A catalog built from models.dev prices yields the expected call cost."""
    from planoai.obs.pricing import PricingCatalog, _parse_models_dev_pricing

    price_map = _parse_models_dev_pricing(_MODELS_DEV_SAMPLE)
    catalog = PricingCatalog(price_map)
    call = _call("anthropic/claude-opus-4-5", 1000, 500)

    # 1000 input @ 5e-6 = 0.005; 500 output @ 25e-6 = 0.0125
    expected = round(0.005 + 0.0125, 6)
    assert catalog.cost_for_call(call) == expected
def test_models_dev_skips_zero_rate_entries():
    """Models priced at zero for both input and output are left out."""
    from planoai.obs.pricing import _parse_models_dev_pricing

    zero_cost = {"input": 0, "output": 0}
    payload = {"free": {"models": {"promo-model": {"cost": zero_cost}}}}

    parsed = _parse_models_dev_pricing(payload)
    assert parsed == {}

View file

@ -582,13 +582,17 @@ properties:
type: string
enum:
- digitalocean
- models.dev
url:
type: string
description: "Optional override for the pricing catalog endpoint. Defaults per provider (digitalocean: DO GenAI catalog; models.dev: https://models.dev/api.json)."
refresh_interval:
type: integer
minimum: 1
description: "Refresh interval in seconds"
model_aliases:
type: object
description: "Map DO catalog keys (lowercase(creator)/model_id) to Plano model names used in routing_preferences. Example: 'openai/openai-gpt-oss-120b: openai/gpt-4o'"
description: "Map catalog keys to Plano model names used in routing_preferences. DigitalOcean keys are 'lowercase(creator)/model_id'; models.dev keys are 'creator/model_id'. Example: 'openai/openai-gpt-oss-120b: openai/gpt-4o'"
additionalProperties:
type: string
required:

View file

@ -9,6 +9,7 @@ use tokio::sync::RwLock;
use tracing::{debug, info, warn};
const DO_PRICING_URL: &str = "https://api.digitalocean.com/v2/gen-ai/models/catalog";
const MODELS_DEV_URL: &str = "https://models.dev/api.json";
pub struct ModelMetricsService {
cost: Arc<RwLock<HashMap<String, f64>>>,
@ -22,28 +23,35 @@ impl ModelMetricsService {
for source in sources {
match source {
MetricsSource::Cost(cfg) => match cfg.provider {
CostProvider::Digitalocean => {
let aliases = cfg.model_aliases.clone().unwrap_or_default();
let data = fetch_do_pricing(&client, &aliases).await;
info!(models = data.len(), "fetched digitalocean pricing");
*cost_data.write().await = data;
MetricsSource::Cost(cfg) => {
let provider = cfg.provider.clone();
let url = cfg
.url
.clone()
.unwrap_or_else(|| default_cost_url(&provider).to_string());
let aliases = cfg.model_aliases.clone().unwrap_or_default();
let provider_name = cost_provider_name(&provider);
if let Some(interval_secs) = cfg.refresh_interval {
let cost_clone = Arc::clone(&cost_data);
let client_clone = client.clone();
let interval = Duration::from_secs(interval_secs);
tokio::spawn(async move {
loop {
tokio::time::sleep(interval).await;
let data = fetch_do_pricing(&client_clone, &aliases).await;
info!(models = data.len(), "refreshed digitalocean pricing");
*cost_clone.write().await = data;
}
});
}
let data = fetch_cost_pricing(&provider, &url, &client, &aliases).await;
info!(models = data.len(), provider = provider_name, url = %url, "fetched cost pricing");
*cost_data.write().await = data;
if let Some(interval_secs) = cfg.refresh_interval {
let cost_clone = Arc::clone(&cost_data);
let client_clone = client.clone();
let interval = Duration::from_secs(interval_secs);
tokio::spawn(async move {
loop {
tokio::time::sleep(interval).await;
let data =
fetch_cost_pricing(&provider, &url, &client_clone, &aliases)
.await;
info!(models = data.len(), provider = provider_name, url = %url, "refreshed cost pricing");
*cost_clone.write().await = data;
}
});
}
},
}
MetricsSource::Latency(cfg) => match cfg.provider {
LatencyProvider::Prometheus => {
let data = fetch_prometheus_metrics(&cfg.url, &cfg.query, &client).await;
@ -165,11 +173,55 @@ struct DoPricing {
output_price_per_million: Option<f64>,
}
async fn fetch_do_pricing(
/// One provider entry of the models.dev catalog (the value stored under a
/// provider id in the top-level object).
#[derive(serde::Deserialize)]
struct ModelsDevProvider {
/// Models listed for this provider, keyed by model key. Defaults to an empty
/// map when the field is absent from the payload.
#[serde(default)]
models: HashMap<String, ModelsDevModel>,
}
/// A single model entry inside a models.dev provider.
#[derive(serde::Deserialize)]
struct ModelsDevModel {
/// Pricing block; `None` when the entry has no `cost` (or it is `null`),
/// in which case the model is skipped by the parser.
cost: Option<ModelsDevCost>,
}
/// The `cost` block of a models.dev model entry. Rates are per-million USD.
#[derive(serde::Deserialize)]
struct ModelsDevCost {
/// Input-token rate; optional because not every entry carries one.
input: Option<f64>,
/// Output-token rate; optional because not every entry carries one.
output: Option<f64>,
}
/// Returns the built-in catalog endpoint for `provider`, used when the
/// config does not supply a `url` override.
fn default_cost_url(provider: &CostProvider) -> &'static str {
    match *provider {
        CostProvider::ModelsDev => MODELS_DEV_URL,
        CostProvider::Digitalocean => DO_PRICING_URL,
    }
}
/// Returns the provider's config-facing name, used as a field in log lines.
fn cost_provider_name(provider: &CostProvider) -> &'static str {
    match *provider {
        CostProvider::ModelsDev => "models.dev",
        CostProvider::Digitalocean => "digitalocean",
    }
}
async fn fetch_cost_pricing(
provider: &CostProvider,
url: &str,
client: &reqwest::Client,
aliases: &HashMap<String, String>,
) -> HashMap<String, f64> {
match client.get(DO_PRICING_URL).send().await {
match provider {
CostProvider::Digitalocean => fetch_do_pricing(url, client, aliases).await,
CostProvider::ModelsDev => fetch_models_dev_pricing(url, client, aliases).await,
}
}
async fn fetch_do_pricing(
url: &str,
client: &reqwest::Client,
aliases: &HashMap<String, String>,
) -> HashMap<String, f64> {
match client.get(url).send().await {
Ok(resp) => match resp.json::<DoModelList>().await {
Ok(list) => list
.data
@ -184,17 +236,66 @@ async fn fetch_do_pricing(
})
.collect(),
Err(err) => {
warn!(error = %err, url = DO_PRICING_URL, "failed to parse digitalocean pricing response");
warn!(error = %err, url = %url, "failed to parse digitalocean pricing response");
HashMap::new()
}
},
Err(err) => {
warn!(error = %err, url = DO_PRICING_URL, "failed to fetch digitalocean pricing");
warn!(error = %err, url = %url, "failed to fetch digitalocean pricing");
HashMap::new()
}
}
}
/// Fetches the models.dev catalog from `url` and converts it to a cost map.
///
/// models.dev publishes a top-level object keyed by provider id; each provider
/// carries a `models` map keyed by model key (bare ids such as
/// `claude-opus-4-5` for first-party providers) whose `cost` block holds
/// per-million USD rates. Parsing is delegated to `parse_models_dev_pricing`,
/// which sums input + output (mirroring the DO ranking metric) and keys the
/// result by `provider_id/model_key` so it lines up with Plano's
/// `provider/model` routing names.
///
/// Returns an empty map, after logging a warning, when the request fails, the
/// server answers with a non-success status, or the body cannot be decoded.
async fn fetch_models_dev_pricing(
    url: &str,
    client: &reqwest::Client,
    aliases: &HashMap<String, String>,
) -> HashMap<String, f64> {
    // Reject non-2xx responses before decoding: an error body such as
    // `{"error": {...}}` would otherwise deserialize as a provider with no
    // models and be accepted as a valid, empty catalog without any warning.
    let fetched = client
        .get(url)
        .send()
        .await
        .and_then(|resp| resp.error_for_status());
    let resp = match fetched {
        Ok(resp) => resp,
        Err(err) => {
            warn!(error = %err, url = %url, "failed to fetch models.dev pricing");
            return HashMap::new();
        }
    };

    match resp.json::<HashMap<String, ModelsDevProvider>>().await {
        Ok(providers) => parse_models_dev_pricing(providers, aliases),
        Err(err) => {
            warn!(error = %err, url = %url, "failed to parse models.dev pricing response");
            HashMap::new()
        }
    }
}
/// Converts a decoded models.dev catalog into a `name -> cost` map.
///
/// The cost is `input + output` (per-million USD rates). Each priced model is
/// stored under `provider_id/model_key`, or under the alias target when
/// `aliases` contains that composite key. The bare `model_key` is also
/// registered as a fallback, without replacing an existing entry. Models with
/// no `cost` block, or missing either rate, are skipped.
///
/// Providers and models are visited in sorted key order so that collisions
/// (the same bare model key offered by several providers, or several catalog
/// keys aliased to one name) resolve the same way on every run.
fn parse_models_dev_pricing(
    providers: HashMap<String, ModelsDevProvider>,
    aliases: &HashMap<String, String>,
) -> HashMap<String, f64> {
    // HashMap iteration order is randomized; sort to make fallbacks stable.
    let mut providers: Vec<_> = providers.into_iter().collect();
    providers.sort_unstable_by(|a, b| a.0.cmp(&b.0));

    let mut out = HashMap::new();
    for (provider_id, provider) in providers {
        let mut models: Vec<_> = provider.models.into_iter().collect();
        models.sort_unstable_by(|a, b| a.0.cmp(&b.0));

        for (model_key, model) in models {
            let Some(cost) = model.cost else { continue };
            let (Some(input), Some(output)) = (cost.input, cost.output) else {
                continue;
            };
            // First-party providers use bare model keys (`claude-opus-4-5`),
            // so compose `provider/model` to line up with Plano routing names.
            let raw_key = format!("{provider_id}/{model_key}");
            let total = input + output;
            let key = aliases.get(&raw_key).cloned().unwrap_or(raw_key);
            out.insert(key, total);
            // Also register the bare model id as a fallback lookup.
            out.entry(model_key).or_insert(total);
        }
    }
    out
}
#[derive(serde::Deserialize)]
struct PrometheusResponse {
data: PrometheusData,
@ -368,6 +469,50 @@ mod tests {
assert_eq!(result, vec!["gpt-4o", "gpt-4o-mini"]);
}
/// Composite `provider/model` keys and bare fallbacks are both produced, and
/// entries without a cost block are dropped.
#[test]
fn test_parse_models_dev_pricing_composes_provider_keys() {
    let payload = r#"{
"anthropic": {
"models": {
"claude-opus-4-5": {"cost": {"input": 5.0, "output": 25.0}}
}
},
"groq": {
"models": {
"llama-3.3-70b-versatile": {"cost": {"input": 0.59, "output": 0.79}},
"whisper-large-v3-turbo": {"cost": null}
}
}
}"#;
    let catalog: HashMap<String, ModelsDevProvider> =
        serde_json::from_str(payload).unwrap();
    let no_aliases = HashMap::new();

    let costs = parse_models_dev_pricing(catalog, &no_aliases);

    // Composite keys hold input + output.
    assert_eq!(costs.get("anthropic/claude-opus-4-5").copied(), Some(30.0));
    assert_eq!(costs.get("groq/llama-3.3-70b-versatile").copied(), Some(1.38));
    // bare fallback also registered
    assert_eq!(costs.get("claude-opus-4-5").copied(), Some(30.0));
    // models with no cost block are skipped
    assert!(costs.get("groq/whisper-large-v3-turbo").is_none());
}
/// An alias on the composite catalog key replaces that key in the output.
#[test]
fn test_parse_models_dev_pricing_applies_aliases() {
    let payload = r#"{
"openai": {"models": {"gpt-oss-120b": {"cost": {"input": 1.0, "output": 2.0}}}}
}"#;
    let catalog: HashMap<String, ModelsDevProvider> =
        serde_json::from_str(payload).unwrap();
    let aliases = HashMap::from([(
        "openai/gpt-oss-120b".to_string(),
        "openai/gpt-4o".to_string(),
    )]);

    let costs = parse_models_dev_pricing(catalog, &aliases);

    assert_eq!(costs.get("openai/gpt-4o").copied(), Some(3.0));
    assert!(costs.get("openai/gpt-oss-120b").is_none());
}
#[test]
fn test_rank_by_ascending_metric_nan_treated_as_missing() {
let models = vec![

View file

@ -177,8 +177,13 @@ pub enum MetricsSource {
/// Configuration for a `type: cost` entry in `model_metrics_sources`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CostMetricsConfig {
/// Which pricing catalog (and therefore which response schema) to use.
pub provider: CostProvider,
/// Optional override for the pricing catalog endpoint. When omitted, a
/// sensible default is used per provider.
pub url: Option<String>,
/// Optional refresh interval in seconds.
pub refresh_interval: Option<u64>,
/// Map catalog keys to Plano model names used in `routing_preferences`.
/// DigitalOcean keys look like `lowercase(creator)/model_id`; models.dev
/// keys look like `creator/model_id`.
/// Example: `openai/openai-gpt-oss-120b: openai/gpt-4o`
pub model_aliases: Option<HashMap<String, String>>,
}
@ -187,6 +192,8 @@ pub struct CostMetricsConfig {
/// Supported pricing catalog providers for a cost metrics source.
#[serde(rename_all = "snake_case")]
pub enum CostProvider {
/// The DigitalOcean catalog; written as `digitalocean` in config.
Digitalocean,
/// The models.dev catalog. Renamed explicitly because the config value
/// `models.dev` contains a dot and so cannot come from `snake_case`.
#[serde(rename = "models.dev")]
ModelsDev,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
@ -741,6 +748,51 @@ mod test {
}
}
// A fully-populated models.dev cost source deserializes into
// `MetricsSource::Cost` with every optional field preserved.
#[test]
fn test_deserialize_models_dev_cost_source() {
// YAML list with a single cost source that sets every supported field.
let yaml = r#"
- type: cost
provider: models.dev
url: https://models.dev/api.json
refresh_interval: 3600
model_aliases:
openai/gpt-oss-120b: openai/gpt-4o
"#;
let sources: Vec<super::MetricsSource> = serde_yaml::from_str(yaml).unwrap();
assert_eq!(sources.len(), 1);
match &sources[0] {
super::MetricsSource::Cost(cfg) => {
// `models.dev` must map onto the explicitly renamed variant.
assert!(matches!(cfg.provider, super::CostProvider::ModelsDev));
assert_eq!(cfg.url.as_deref(), Some("https://models.dev/api.json"));
assert_eq!(cfg.refresh_interval, Some(3600));
// The alias map keeps the catalog key -> Plano model name pair.
assert_eq!(
cfg.model_aliases
.as_ref()
.and_then(|m| m.get("openai/gpt-oss-120b"))
.map(String::as_str),
Some("openai/gpt-4o")
);
}
other => panic!("expected cost source, got {other:?}"),
}
}
// A DigitalOcean cost source with no `url` still deserializes, leaving
// `url` as `None`.
#[test]
fn test_deserialize_digitalocean_cost_source_without_url() {
// Minimal cost source: only the required `provider` field is set.
let yaml = r#"
- type: cost
provider: digitalocean
"#;
let sources: Vec<super::MetricsSource> = serde_yaml::from_str(yaml).unwrap();
match &sources[0] {
super::MetricsSource::Cost(cfg) => {
assert!(matches!(cfg.provider, super::CostProvider::Digitalocean));
// Omitted optional field deserializes to None.
assert_eq!(cfg.url, None);
}
other => panic!("expected cost source, got {other:?}"),
}
}
#[test]
fn test_into_models_filters_internal_providers() {
let providers = vec![

View file

@ -209,6 +209,178 @@ Clients can let the router decide or still specify aliases:
)
.. _cost_latency_aware_selection:
Cost- and latency-aware selection
---------------------------------
When a route lists more than one candidate model, you can let Plano reorder that
candidate pool using **live cost or latency data** instead of relying solely on the
order you wrote them in. This is controlled per route with ``selection_policy`` and
backed by one or more ``model_metrics_sources``.
This is useful when several models are equally capable for a route and you want Plano
to always reach for the cheapest (or fastest) option first, with the others kept as
fallbacks.
Selection policy
~~~~~~~~~~~~~~~~~
Attach an optional ``selection_policy`` to any entry in ``routing_preferences``:
.. code-block:: yaml
:caption: Per-route selection policy
routing_preferences:
- name: code review
description: reviewing, analyzing, and suggesting improvements to existing code
models:
- anthropic/claude-sonnet-4-5
- groq/llama-3.3-70b-versatile
selection_policy:
prefer: cheapest # cheapest | fastest | none
``prefer`` accepts:
- ``cheapest`` — order candidates by total price (input + output rate) ascending, using a ``cost`` metrics source.
- ``fastest`` — order candidates by observed latency ascending, using a ``latency`` metrics source.
- ``none`` (default) — keep the order you declared; no reordering.
Models that have no data in the selected source are ranked **last**, in their original
order, so routing always degrades gracefully rather than dropping a candidate.
Configuring the pricing source
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
``cheapest`` routing needs a price catalog. Plano's **default pricing provider is
DigitalOcean** — its GenAI model catalog is public (no API key, no signup), so cost data
is available out of the box and is what ``planoai obs`` uses if you don't configure
anything. The pricing source is fully swappable: point Plano at `models.dev <https://models.dev/>`_,
or at **any endpoint that exposes a supported pricing structure**.
The ``provider`` field selects which response schema Plano expects (and therefore how it
parses the catalog); the optional ``url`` lets you override the endpoint — for example to
use a mirror, a cached copy, or an internal catalog service that returns the same shape.
.. list-table::
:header-rows: 1
:widths: 18 34 28 20
* - ``provider``
- Default catalog URL
- Key format
- Expected structure
* - ``digitalocean`` *(default)*
- DigitalOcean GenAI model catalog
- ``lowercase(creator)/model_id``
- ``{ data: [ { model_id, pricing: { input_price_per_million, output_price_per_million } } ] }``
* - ``models.dev``
- ``https://models.dev/api.json``
- ``creator/model`` (e.g. ``anthropic/claude-sonnet-4-5``)
- ``{ <provider>: { models: { <model>: { cost: { input, output } } } } }``
Because the source is selected per ``provider``, switching is a one-line change. To stay
on the default DigitalOcean catalog you can omit ``model_metrics_sources`` entirely for
``planoai obs``, or declare it explicitly for routing:
.. code-block:: yaml
:caption: Default cost source (DigitalOcean)
model_metrics_sources:
- type: cost
provider: digitalocean # default; uses the public DO GenAI catalog
To switch to models.dev — an open, community-maintained catalog covering a broad range of
providers and models — change the ``provider`` (and optionally ``url``):
.. code-block:: yaml
:caption: Cost source backed by models.dev
model_metrics_sources:
- type: cost
provider: models.dev # models.dev | digitalocean
url: https://models.dev/api.json # optional; defaults per provider
refresh_interval: 3600 # optional, seconds; refetch on this interval
model_aliases: # optional; see below
openai/gpt-oss-120b: openai/gpt-4o
To use your own endpoint, pick the ``provider`` whose structure your endpoint matches and
override ``url`` — Plano parses the response with that provider's schema:
.. code-block:: yaml
:caption: Custom endpoint exposing the DigitalOcean catalog structure
model_metrics_sources:
- type: cost
provider: digitalocean # selects the DO response schema
url: https://catalog.internal.example.com/pricing
.. note::
The cost metric used for ranking is the sum of the input and output per-million-token
rates — a relative signal for ordering candidates, not a per-request bill. For actual
per-request cost, see the observability console below.
Matching catalog keys to your models
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The router looks up each candidate model by the exact name you use in
``routing_preferences`` (e.g. ``anthropic/claude-sonnet-4-5``). For models.dev, Plano
builds each catalog key as ``<provider id>/<model key>`` and also registers the bare
model key as a fallback. The composite key lines up with Plano's ``provider/model``
naming, so most models match automatically.
When a catalog key does not match your model name — for example a version skew, or an
open-weight model you serve under a different provider — use ``model_aliases`` to map the
**catalog key** to the **Plano model name** used in your routing preferences:
.. code-block:: yaml
model_metrics_sources:
- type: cost
provider: models.dev
model_aliases:
# catalog key : plano model name
openai/gpt-oss-120b: openai/gpt-4o
Latency source
~~~~~~~~~~~~~~~
``fastest`` routing reads observed latency from a Prometheus instance. Provide the query
that returns a per-model latency value (lower is faster), labelled by ``model_name``:
.. code-block:: yaml
:caption: Latency source backed by Prometheus
model_metrics_sources:
- type: latency
provider: prometheus
url: http://prometheus:9090
query: avg by (model_name) (rate(plano_llm_latency_seconds_sum[5m]))
refresh_interval: 60
You can declare both a ``cost`` and a ``latency`` source at the same time; each route
picks whichever it needs based on its ``selection_policy``.
Cost in the observability console
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
``planoai obs`` displays a per-request USD cost column derived from the same pricing
catalog. By default it reads the ``cost`` source from your config (the first
``type: cost`` entry under ``model_metrics_sources``); you can also override it on the
command line:
.. code-block:: bash
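# Use the cost source from ./config.yaml (default)
planoai obs
# Or override the provider explicitly (uses that provider's default endpoint)
planoai obs --pricing-provider models.dev
# --pricing-url only changes the endpoint; pair it with the provider whose
# response structure that endpoint returns
planoai obs --pricing-provider models.dev --pricing-url https://models.dev/api.json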
If no source is configured and no override is given, ``planoai obs`` falls back to the
DigitalOcean catalog so the cost column still populates out of the box.
Plano-Orchestrator
-------------------
Plano-Orchestrator is a **preference-based routing model** specifically designed to address the limitations of traditional LLM routing. It delivers production-ready performance with low latency and high accuracy while solving key routing challenges.

View file

@ -86,6 +86,24 @@ routing_preferences:
selection_policy:
prefer: cheapest
# model_metrics_sources: external catalogs the router reads to reorder candidate
# models for selection_policy.prefer. A `cost` source ranks `prefer: cheapest`;
# a `latency` source ranks `prefer: fastest`. Both are optional.
model_metrics_sources:
# Cost catalog. provider: models.dev | digitalocean (default url per provider).
- type: cost
provider: models.dev
url: https://models.dev/api.json # optional; omit to use the provider default
refresh_interval: 3600 # optional, seconds
model_aliases: # optional: catalog key -> Plano model name
openai/gpt-oss-120b: openai/gpt-4o
# Latency catalog (Prometheus). Used for selection_policy.prefer: fastest.
- type: latency
provider: prometheus
url: http://prometheus:9090
query: avg by (model_name) (rate(plano_llm_latency_seconds_sum[5m]))
refresh_interval: 60
# HTTP listeners - entry points for agent routing, prompt targets, and direct LLM access
listeners:
# Agent listener for routing requests to multiple agents

View file

@ -115,6 +115,18 @@ model_aliases:
target: gpt-4o-mini
smart-llm:
target: gpt-4o
model_metrics_sources:
- model_aliases:
openai/gpt-oss-120b: openai/gpt-4o
provider: models.dev
refresh_interval: 3600
type: cost
url: https://models.dev/api.json
- provider: prometheus
query: avg by (model_name) (rate(plano_llm_latency_seconds_sum[5m]))
refresh_interval: 60
type: latency
url: http://prometheus:9090
model_providers:
- access_key: $OPENAI_API_KEY
default: true