mirror of
https://github.com/katanemo/plano.git
synced 2026-06-26 15:39:40 +02:00
feat: make model pricing source configurable (models.dev + DigitalOcean) (#971)
This commit is contained in:
parent
5cc4c4ee77
commit
558df0307c
9 changed files with 687 additions and 48 deletions
|
|
@ -1,7 +1,8 @@
|
|||
"""DigitalOcean Gradient pricing catalog for the obs console.
|
||||
"""Model pricing catalog for the obs console.
|
||||
|
||||
Ported loosely from ``crates/brightstaff/src/router/model_metrics.rs::fetch_do_pricing``.
|
||||
Single-source: one fetch at startup, cached for the life of the process.
|
||||
Mirrors ``crates/brightstaff/src/router/model_metrics.rs``. The source is
|
||||
configurable: ``digitalocean`` (DO GenAI catalog) or ``models.dev``. A single
|
||||
fetch at startup is cached for the life of the process.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
|
@ -14,7 +15,18 @@ from typing import Any
|
|||
|
||||
import requests
|
||||
|
||||
DEFAULT_PRICING_URL = "https://api.digitalocean.com/v2/gen-ai/models/catalog"
|
||||
DO_PRICING_URL = "https://api.digitalocean.com/v2/gen-ai/models/catalog"
|
||||
MODELS_DEV_URL = "https://models.dev/api.json"
|
||||
|
||||
# Backwards-compatible default (DigitalOcean) used when no provider is given.
|
||||
DEFAULT_PRICING_URL = DO_PRICING_URL
|
||||
DEFAULT_PRICING_PROVIDER = "digitalocean"
|
||||
|
||||
_DEFAULT_URLS = {
|
||||
"digitalocean": DO_PRICING_URL,
|
||||
"models.dev": MODELS_DEV_URL,
|
||||
}
|
||||
|
||||
FETCH_TIMEOUT_SECS = 5.0
|
||||
|
||||
|
||||
|
|
@ -51,36 +63,52 @@ class PricingCatalog:
|
|||
return list(self._prices.keys())[:n]
|
||||
|
||||
@classmethod
|
||||
def fetch(cls, url: str = DEFAULT_PRICING_URL) -> "PricingCatalog":
|
||||
"""Fetch pricing from DO's catalog endpoint. On failure, returns an
|
||||
def fetch(
|
||||
cls,
|
||||
provider: str = DEFAULT_PRICING_PROVIDER,
|
||||
url: str | None = None,
|
||||
) -> "PricingCatalog":
|
||||
"""Fetch pricing from the configured catalog. On failure, returns an
|
||||
empty catalog (cost column will be blank).
|
||||
|
||||
The catalog endpoint is public — no auth required, no signup — so
|
||||
``planoai obs`` gets cost data on first run out of the box.
|
||||
``provider`` selects the parser/default URL: ``digitalocean`` or
|
||||
``models.dev``. Both catalog endpoints are public — no auth required —
|
||||
so ``planoai obs`` gets cost data on first run out of the box.
|
||||
"""
|
||||
provider = (provider or DEFAULT_PRICING_PROVIDER).strip().lower()
|
||||
resolved_url = url or _DEFAULT_URLS.get(provider, DO_PRICING_URL)
|
||||
try:
|
||||
resp = requests.get(url, timeout=FETCH_TIMEOUT_SECS)
|
||||
resp = requests.get(resolved_url, timeout=FETCH_TIMEOUT_SECS)
|
||||
resp.raise_for_status()
|
||||
data = resp.json()
|
||||
except Exception as exc: # noqa: BLE001 — best-effort; never fatal
|
||||
logger.warning(
|
||||
"DO pricing fetch failed: %s; cost column will be blank.",
|
||||
"%s pricing fetch failed: %s; cost column will be blank.",
|
||||
provider,
|
||||
exc,
|
||||
)
|
||||
return cls()
|
||||
|
||||
prices = _parse_do_pricing(data)
|
||||
if provider == "models.dev":
|
||||
prices = _parse_models_dev_pricing(data)
|
||||
else:
|
||||
prices = _parse_do_pricing(data)
|
||||
|
||||
if not prices:
|
||||
# Dump the first entry's raw shape so we can see which fields DO
|
||||
# actually returned — helps when the catalog adds new fields or
|
||||
# the response doesn't match our parser.
|
||||
# Dump a sample of the raw shape so we can see which fields the
|
||||
# catalog returned — helps when it adds new fields or the response
|
||||
# doesn't match our parser.
|
||||
import json as _json
|
||||
|
||||
sample_items = _coerce_items(data)
|
||||
sample = sample_items[0] if sample_items else data
|
||||
if provider == "models.dev" and isinstance(data, dict):
|
||||
sample = next(iter(data.values()), data)
|
||||
else:
|
||||
sample_items = _coerce_items(data)
|
||||
sample = sample_items[0] if sample_items else data
|
||||
logger.warning(
|
||||
"DO pricing response had no parseable entries; cost column "
|
||||
"%s pricing response had no parseable entries; cost column "
|
||||
"will be blank. Sample entry: %s",
|
||||
provider,
|
||||
_json.dumps(sample, default=str)[:400],
|
||||
)
|
||||
return cls(prices)
|
||||
|
|
@ -278,6 +306,75 @@ def _parse_do_pricing(data: Any) -> dict[str, ModelPrice]:
|
|||
return prices
|
||||
|
||||
|
||||
def _parse_models_dev_pricing(data: Any) -> dict[str, ModelPrice]:
    """Build a ModelPrice lookup from a models.dev ``api.json`` payload.

    The payload is a top-level object keyed by provider id; each provider
    holds a ``models`` object keyed by model id, and each model may carry a
    ``cost`` object::

        {
          "anthropic": {
            "models": {
              "claude-opus-4-5": {
                "cost": {"input": 5, "output": 25, "cache_read": 0.5}
              }
            }
          },
          ...
        }

    ``cost.*`` values are USD per *million* tokens and are divided by 1e6 to
    obtain per-token rates. Every priced model is registered under
    ``provider/model`` (matching Plano's routing names); the lowercase
    composite, the bare model id and the lowercase bare id are added as
    fallbacks only when those keys are not already taken.

    Entries that are malformed, lack an ``input`` or ``output`` rate, or have
    both rates equal to zero are skipped. Anything that is not a dict at the
    top level yields an empty map.
    """
    catalog: dict[str, ModelPrice] = {}
    vendors = data.items() if isinstance(data, dict) else ()

    for vendor_id, vendor_entry in vendors:
        entries = vendor_entry.get("models") if isinstance(vendor_entry, dict) else None
        if not isinstance(entries, dict):
            continue

        for name, entry in entries.items():
            rates = entry.get("cost") if isinstance(entry, dict) else None
            if not isinstance(rates, dict):
                continue

            in_rate = _as_float(rates.get("input"))
            out_rate = _as_float(rates.get("output"))
            if in_rate is None or out_rate is None:
                continue
            # Skip 0-rate entries so cost falls back to `—` rather than $0.0000.
            if in_rate == 0 == out_rate:
                continue

            cache_rate = _as_float(rates.get("cache_read"))
            cached_per_token = None if cache_rate is None else cache_rate / 1_000_000
            model_price = ModelPrice(
                input_per_token_usd=in_rate / 1_000_000,
                output_per_token_usd=out_rate / 1_000_000,
                cached_input_per_token_usd=cached_per_token,
            )

            qualified = f"{vendor_id}/{name}"
            # The composite key always wins; the fallbacks never overwrite.
            catalog[qualified] = model_price
            for fallback in (qualified.lower(), str(name), str(name).lower()):
                catalog.setdefault(fallback, model_price)

    return catalog
|
||||
|
||||
|
||||
def _as_float(value: Any) -> float | None:
|
||||
if value is None:
|
||||
return None
|
||||
try:
|
||||
return float(value)
|
||||
except (TypeError, ValueError):
|
||||
return None
|
||||
|
||||
|
||||
def _coerce_items(data: Any) -> list[dict]:
|
||||
if isinstance(data, list):
|
||||
return [x for x in data if isinstance(x, dict)]
|
||||
|
|
|
|||
|
|
@ -2,9 +2,12 @@
|
|||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
|
||||
import rich_click as click
|
||||
import yaml
|
||||
from rich.console import Console
|
||||
from rich.live import Live
|
||||
|
||||
|
|
@ -15,8 +18,50 @@ from planoai.obs.collector import (
|
|||
LLMCallStore,
|
||||
ObsCollector,
|
||||
)
|
||||
from planoai.obs.pricing import PricingCatalog
|
||||
from planoai.obs.pricing import DEFAULT_PRICING_PROVIDER, PricingCatalog
|
||||
from planoai.obs.render import render
|
||||
from planoai.utils import find_config_file
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _resolve_pricing_source(
    config_file: str | None,
    provider_override: str | None,
    url_override: str | None,
) -> tuple[str, str | None]:
    """Pick the cost pricing source.

    Precedence: explicit CLI overrides > the first ``type: cost`` entry in
    ``model_metrics_sources`` from the Plano config > the DigitalOcean default.

    Args:
        config_file: Optional path to the Plano config, passed to
            ``find_config_file``.
        provider_override: Provider given on the command line, if any.
        url_override: Catalog URL given on the command line, if any.

    Returns:
        ``(provider, url)``. ``url`` is ``None`` when neither the config nor
        the CLI supplies one, leaving the choice of the provider's default
        endpoint to the caller.

    A URL read from the config belongs to the provider configured there. When
    the CLI overrides the provider with a different one and gives no URL, the
    configured URL is dropped so one provider's endpoint is not parsed with
    another provider's schema.
    """
    provider = DEFAULT_PRICING_PROVIDER
    url: str | None = None

    config_path = find_config_file(file=config_file)
    if config_path and os.path.exists(config_path):
        try:
            # YAML is UTF-8 by default; don't depend on the platform locale.
            with open(config_path, "r", encoding="utf-8") as f:
                config = yaml.safe_load(f) or {}
            sources = config.get("model_metrics_sources") or []
            for source in sources:
                if isinstance(source, dict) and source.get("type") == "cost":
                    if source.get("provider"):
                        provider = str(source["provider"])
                    if source.get("url"):
                        url = str(source["url"])
                    # Only the first cost source is considered.
                    break
        except Exception as exc:  # noqa: BLE001 — config is optional for obs
            logger.warning(
                "could not read pricing source from %s: %s", config_path, exc
            )

    if provider_override:
        if provider_override.strip().lower() != provider.strip().lower():
            # The configured URL serves a different schema; fall back to the
            # overriding provider's default unless a URL is also overridden.
            url = None
        provider = provider_override
    if url_override:
        url = url_override

    return provider, url
|
||||
|
||||
|
||||
@click.command(name="obs", help="Live observability console for Plano LLM traffic.")
|
||||
|
|
@ -48,13 +93,42 @@ from planoai.obs.render import render
|
|||
show_default=True,
|
||||
help="TUI refresh interval.",
|
||||
)
|
||||
def obs(port: int, host: str, capacity: int, refresh_ms: int) -> None:
|
||||
@click.option(
|
||||
"--config",
|
||||
"config_file",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to the Plano config to read the pricing source from "
|
||||
"(defaults to ./config.yaml or ./plano_config.yaml).",
|
||||
)
|
||||
@click.option(
|
||||
"--pricing-provider",
|
||||
type=click.Choice(["digitalocean", "models.dev"]),
|
||||
default=None,
|
||||
help="Override the cost pricing provider (otherwise read from config).",
|
||||
)
|
||||
@click.option(
|
||||
"--pricing-url",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Override the pricing catalog URL (otherwise read from config / provider default).",
|
||||
)
|
||||
def obs(
|
||||
port: int,
|
||||
host: str,
|
||||
capacity: int,
|
||||
refresh_ms: int,
|
||||
config_file: str | None,
|
||||
pricing_provider: str | None,
|
||||
pricing_url: str | None,
|
||||
) -> None:
|
||||
console = Console()
|
||||
provider, url = _resolve_pricing_source(config_file, pricing_provider, pricing_url)
|
||||
console.print(
|
||||
f"[bold {PLANO_COLOR}]planoai obs[/] — loading DO pricing catalog...",
|
||||
f"[bold {PLANO_COLOR}]planoai obs[/] — loading {provider} pricing catalog...",
|
||||
end="",
|
||||
)
|
||||
pricing = PricingCatalog.fetch()
|
||||
pricing = PricingCatalog.fetch(provider=provider, url=url)
|
||||
if len(pricing):
|
||||
sample = ", ".join(pricing.sample_models(3))
|
||||
console.print(
|
||||
|
|
@ -63,7 +137,7 @@ def obs(port: int, host: str, capacity: int, refresh_ms: int) -> None:
|
|||
else:
|
||||
console.print(
|
||||
" [yellow]no pricing loaded[/] — "
|
||||
"[dim]cost column will be blank (DO catalog unreachable)[/]"
|
||||
f"[dim]cost column will be blank ({provider} catalog unreachable)[/]"
|
||||
)
|
||||
|
||||
store = LLMCallStore(capacity=capacity)
|
||||
|
|
|
|||
|
|
@ -144,3 +144,68 @@ def test_parse_do_catalog_divides_large_values_as_per_million():
|
|||
prices = _parse_do_pricing(sample)
|
||||
assert prices["mystery-model"].input_per_token_usd == 5.0 / 1_000_000
|
||||
assert prices["mystery-model"].output_per_token_usd == 15.0 / 1_000_000
|
||||
|
||||
|
||||
_MODELS_DEV_SAMPLE = {
|
||||
"anthropic": {
|
||||
"id": "anthropic",
|
||||
"models": {
|
||||
"claude-opus-4-5": {
|
||||
"id": "claude-opus-4-5",
|
||||
"cost": {"input": 5, "output": 25, "cache_read": 0.5},
|
||||
}
|
||||
},
|
||||
},
|
||||
"groq": {
|
||||
"id": "groq",
|
||||
"models": {
|
||||
"llama-3.3-70b-versatile": {
|
||||
"id": "llama-3.3-70b-versatile",
|
||||
"cost": {"input": 0.59, "output": 0.79},
|
||||
},
|
||||
# No cost block → skipped.
|
||||
"whisper-large-v3-turbo": {"id": "whisper-large-v3-turbo"},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def test_parse_models_dev_composes_provider_keys_and_per_token_rates():
    """The parser keys by ``provider/model`` and converts per-million rates."""
    from planoai.obs.pricing import _parse_models_dev_pricing

    parsed = _parse_models_dev_pricing(_MODELS_DEV_SAMPLE)

    # models.dev cost values are per-million → divided by 1e6.
    per_million = 1_000_000
    opus_price = parsed["anthropic/claude-opus-4-5"]
    assert opus_price.input_per_token_usd == 5 / per_million
    assert opus_price.output_per_token_usd == 25 / per_million
    assert opus_price.cached_input_per_token_usd == 0.5 / per_million

    registered = set(parsed)
    # Composite provider/model keys match Plano's routing names.
    assert "groq/llama-3.3-70b-versatile" in registered
    # Bare model id registered as a fallback.
    assert "llama-3.3-70b-versatile" in registered
    # Models without a cost block are skipped.
    assert "groq/whisper-large-v3-turbo" not in registered
|
||||
|
||||
|
||||
def test_models_dev_catalog_cost_computation():
    """A catalog built from models.dev prices yields the expected call cost."""
    from planoai.obs.pricing import PricingCatalog, _parse_models_dev_pricing

    prices = _parse_models_dev_pricing(_MODELS_DEV_SAMPLE)
    catalog = PricingCatalog(prices)

    call = _call("anthropic/claude-opus-4-5", 1000, 500)
    # 1000 input @ 5e-6 = 0.005; 500 output @ 25e-6 = 0.0125
    expected = round(0.005 + 0.0125, 6)
    assert catalog.cost_for_call(call) == expected
|
||||
|
||||
|
||||
def test_models_dev_skips_zero_rate_entries():
    """A model whose input and output rates are both zero is not registered."""
    from planoai.obs.pricing import _parse_models_dev_pricing

    zero_cost = {"input": 0, "output": 0}
    payload = {"free": {"models": {"promo-model": {"cost": zero_cost}}}}

    assert _parse_models_dev_pricing(payload) == {}
|
||||
|
|
|
|||
|
|
@ -582,13 +582,17 @@ properties:
|
|||
type: string
|
||||
enum:
|
||||
- digitalocean
|
||||
- models.dev
|
||||
url:
|
||||
type: string
|
||||
description: "Optional override for the pricing catalog endpoint. Defaults per provider (digitalocean: DO GenAI catalog; models.dev: https://models.dev/api.json)."
|
||||
refresh_interval:
|
||||
type: integer
|
||||
minimum: 1
|
||||
description: "Refresh interval in seconds"
|
||||
model_aliases:
|
||||
type: object
|
||||
description: "Map DO catalog keys (lowercase(creator)/model_id) to Plano model names used in routing_preferences. Example: 'openai/openai-gpt-oss-120b: openai/gpt-4o'"
|
||||
description: "Map catalog keys to Plano model names used in routing_preferences. DigitalOcean keys are 'lowercase(creator)/model_id'; models.dev keys are 'creator/model_id'. Example: 'openai/openai-gpt-oss-120b: openai/gpt-4o'"
|
||||
additionalProperties:
|
||||
type: string
|
||||
required:
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ use tokio::sync::RwLock;
|
|||
use tracing::{debug, info, warn};
|
||||
|
||||
const DO_PRICING_URL: &str = "https://api.digitalocean.com/v2/gen-ai/models/catalog";
|
||||
const MODELS_DEV_URL: &str = "https://models.dev/api.json";
|
||||
|
||||
pub struct ModelMetricsService {
|
||||
cost: Arc<RwLock<HashMap<String, f64>>>,
|
||||
|
|
@ -22,28 +23,35 @@ impl ModelMetricsService {
|
|||
|
||||
for source in sources {
|
||||
match source {
|
||||
MetricsSource::Cost(cfg) => match cfg.provider {
|
||||
CostProvider::Digitalocean => {
|
||||
let aliases = cfg.model_aliases.clone().unwrap_or_default();
|
||||
let data = fetch_do_pricing(&client, &aliases).await;
|
||||
info!(models = data.len(), "fetched digitalocean pricing");
|
||||
*cost_data.write().await = data;
|
||||
MetricsSource::Cost(cfg) => {
|
||||
let provider = cfg.provider.clone();
|
||||
let url = cfg
|
||||
.url
|
||||
.clone()
|
||||
.unwrap_or_else(|| default_cost_url(&provider).to_string());
|
||||
let aliases = cfg.model_aliases.clone().unwrap_or_default();
|
||||
let provider_name = cost_provider_name(&provider);
|
||||
|
||||
if let Some(interval_secs) = cfg.refresh_interval {
|
||||
let cost_clone = Arc::clone(&cost_data);
|
||||
let client_clone = client.clone();
|
||||
let interval = Duration::from_secs(interval_secs);
|
||||
tokio::spawn(async move {
|
||||
loop {
|
||||
tokio::time::sleep(interval).await;
|
||||
let data = fetch_do_pricing(&client_clone, &aliases).await;
|
||||
info!(models = data.len(), "refreshed digitalocean pricing");
|
||||
*cost_clone.write().await = data;
|
||||
}
|
||||
});
|
||||
}
|
||||
let data = fetch_cost_pricing(&provider, &url, &client, &aliases).await;
|
||||
info!(models = data.len(), provider = provider_name, url = %url, "fetched cost pricing");
|
||||
*cost_data.write().await = data;
|
||||
|
||||
if let Some(interval_secs) = cfg.refresh_interval {
|
||||
let cost_clone = Arc::clone(&cost_data);
|
||||
let client_clone = client.clone();
|
||||
let interval = Duration::from_secs(interval_secs);
|
||||
tokio::spawn(async move {
|
||||
loop {
|
||||
tokio::time::sleep(interval).await;
|
||||
let data =
|
||||
fetch_cost_pricing(&provider, &url, &client_clone, &aliases)
|
||||
.await;
|
||||
info!(models = data.len(), provider = provider_name, url = %url, "refreshed cost pricing");
|
||||
*cost_clone.write().await = data;
|
||||
}
|
||||
});
|
||||
}
|
||||
},
|
||||
}
|
||||
MetricsSource::Latency(cfg) => match cfg.provider {
|
||||
LatencyProvider::Prometheus => {
|
||||
let data = fetch_prometheus_metrics(&cfg.url, &cfg.query, &client).await;
|
||||
|
|
@ -165,11 +173,55 @@ struct DoPricing {
|
|||
output_price_per_million: Option<f64>,
|
||||
}
|
||||
|
||||
async fn fetch_do_pricing(
|
||||
#[derive(serde::Deserialize)]
|
||||
struct ModelsDevProvider {
|
||||
#[serde(default)]
|
||||
models: HashMap<String, ModelsDevModel>,
|
||||
}
|
||||
|
||||
#[derive(serde::Deserialize)]
|
||||
struct ModelsDevModel {
|
||||
cost: Option<ModelsDevCost>,
|
||||
}
|
||||
|
||||
#[derive(serde::Deserialize)]
|
||||
struct ModelsDevCost {
|
||||
input: Option<f64>,
|
||||
output: Option<f64>,
|
||||
}
|
||||
|
||||
fn default_cost_url(provider: &CostProvider) -> &'static str {
|
||||
match provider {
|
||||
CostProvider::Digitalocean => DO_PRICING_URL,
|
||||
CostProvider::ModelsDev => MODELS_DEV_URL,
|
||||
}
|
||||
}
|
||||
|
||||
fn cost_provider_name(provider: &CostProvider) -> &'static str {
|
||||
match provider {
|
||||
CostProvider::Digitalocean => "digitalocean",
|
||||
CostProvider::ModelsDev => "models.dev",
|
||||
}
|
||||
}
|
||||
|
||||
async fn fetch_cost_pricing(
|
||||
provider: &CostProvider,
|
||||
url: &str,
|
||||
client: &reqwest::Client,
|
||||
aliases: &HashMap<String, String>,
|
||||
) -> HashMap<String, f64> {
|
||||
match client.get(DO_PRICING_URL).send().await {
|
||||
match provider {
|
||||
CostProvider::Digitalocean => fetch_do_pricing(url, client, aliases).await,
|
||||
CostProvider::ModelsDev => fetch_models_dev_pricing(url, client, aliases).await,
|
||||
}
|
||||
}
|
||||
|
||||
async fn fetch_do_pricing(
|
||||
url: &str,
|
||||
client: &reqwest::Client,
|
||||
aliases: &HashMap<String, String>,
|
||||
) -> HashMap<String, f64> {
|
||||
match client.get(url).send().await {
|
||||
Ok(resp) => match resp.json::<DoModelList>().await {
|
||||
Ok(list) => list
|
||||
.data
|
||||
|
|
@ -184,17 +236,66 @@ async fn fetch_do_pricing(
|
|||
})
|
||||
.collect(),
|
||||
Err(err) => {
|
||||
warn!(error = %err, url = DO_PRICING_URL, "failed to parse digitalocean pricing response");
|
||||
warn!(error = %err, url = %url, "failed to parse digitalocean pricing response");
|
||||
HashMap::new()
|
||||
}
|
||||
},
|
||||
Err(err) => {
|
||||
warn!(error = %err, url = DO_PRICING_URL, "failed to fetch digitalocean pricing");
|
||||
warn!(error = %err, url = %url, "failed to fetch digitalocean pricing");
|
||||
HashMap::new()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// models.dev publishes a top-level object keyed by provider id; each provider
|
||||
/// carries a `models` map whose keys are `creator/model` ids and whose `cost`
|
||||
/// block holds per-million USD rates. We sum input + output (mirroring the DO
|
||||
/// ranking metric) and key the result by `creator/model_id` so it lines up with
|
||||
/// Plano's `provider/model` routing names.
|
||||
async fn fetch_models_dev_pricing(
|
||||
url: &str,
|
||||
client: &reqwest::Client,
|
||||
aliases: &HashMap<String, String>,
|
||||
) -> HashMap<String, f64> {
|
||||
match client.get(url).send().await {
|
||||
Ok(resp) => match resp.json::<HashMap<String, ModelsDevProvider>>().await {
|
||||
Ok(providers) => parse_models_dev_pricing(providers, aliases),
|
||||
Err(err) => {
|
||||
warn!(error = %err, url = %url, "failed to parse models.dev pricing response");
|
||||
HashMap::new()
|
||||
}
|
||||
},
|
||||
Err(err) => {
|
||||
warn!(error = %err, url = %url, "failed to fetch models.dev pricing");
|
||||
HashMap::new()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_models_dev_pricing(
|
||||
providers: HashMap<String, ModelsDevProvider>,
|
||||
aliases: &HashMap<String, String>,
|
||||
) -> HashMap<String, f64> {
|
||||
let mut out = HashMap::new();
|
||||
for (provider_id, provider) in providers {
|
||||
for (model_key, model) in provider.models {
|
||||
let Some(cost) = model.cost else { continue };
|
||||
let (Some(input), Some(output)) = (cost.input, cost.output) else {
|
||||
continue;
|
||||
};
|
||||
// First-party providers use bare model keys (`claude-opus-4-5`),
|
||||
// so compose `provider/model` to line up with Plano routing names.
|
||||
let raw_key = format!("{provider_id}/{model_key}");
|
||||
let total = input + output;
|
||||
let key = aliases.get(&raw_key).cloned().unwrap_or(raw_key);
|
||||
out.insert(key, total);
|
||||
// Also register the bare model id as a fallback lookup.
|
||||
out.entry(model_key).or_insert(total);
|
||||
}
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
#[derive(serde::Deserialize)]
|
||||
struct PrometheusResponse {
|
||||
data: PrometheusData,
|
||||
|
|
@ -368,6 +469,50 @@ mod tests {
|
|||
assert_eq!(result, vec!["gpt-4o", "gpt-4o-mini"]);
|
||||
}
|
||||
|
||||
/// Composite `provider/model` keys and bare-id fallbacks are both registered,
/// and models without a cost block are left out.
#[test]
fn test_parse_models_dev_pricing_composes_provider_keys() {
    let payload = r#"{
        "anthropic": {
            "models": {
                "claude-opus-4-5": {"cost": {"input": 5.0, "output": 25.0}}
            }
        },
        "groq": {
            "models": {
                "llama-3.3-70b-versatile": {"cost": {"input": 0.59, "output": 0.79}},
                "whisper-large-v3-turbo": {"cost": null}
            }
        }
    }"#;
    let catalog: HashMap<String, ModelsDevProvider> = serde_json::from_str(payload).unwrap();
    let no_aliases = HashMap::new();

    let parsed = parse_models_dev_pricing(catalog, &no_aliases);

    let opus = parsed.get("anthropic/claude-opus-4-5");
    let llama = parsed.get("groq/llama-3.3-70b-versatile");
    assert_eq!(opus, Some(&30.0));
    assert_eq!(llama, Some(&1.38));
    // bare fallback also registered
    let bare_opus = parsed.get("claude-opus-4-5");
    assert_eq!(bare_opus, Some(&30.0));
    // models with no cost block are skipped
    let has_whisper = parsed.contains_key("groq/whisper-large-v3-turbo");
    assert!(!has_whisper);
}
|
||||
|
||||
/// An alias replaces the composite catalog key with the Plano model name.
#[test]
fn test_parse_models_dev_pricing_applies_aliases() {
    let payload = r#"{
        "openai": {"models": {"gpt-oss-120b": {"cost": {"input": 1.0, "output": 2.0}}}}
    }"#;
    let catalog: HashMap<String, ModelsDevProvider> = serde_json::from_str(payload).unwrap();
    let alias_map: HashMap<String, String> = HashMap::from([(
        "openai/gpt-oss-120b".to_string(),
        "openai/gpt-4o".to_string(),
    )]);

    let parsed = parse_models_dev_pricing(catalog, &alias_map);

    let aliased = parsed.get("openai/gpt-4o");
    assert_eq!(aliased, Some(&3.0));
    let has_raw_key = parsed.contains_key("openai/gpt-oss-120b");
    assert!(!has_raw_key);
}
|
||||
|
||||
#[test]
|
||||
fn test_rank_by_ascending_metric_nan_treated_as_missing() {
|
||||
let models = vec![
|
||||
|
|
|
|||
|
|
@ -177,8 +177,13 @@ pub enum MetricsSource {
|
|||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct CostMetricsConfig {
|
||||
pub provider: CostProvider,
|
||||
/// Optional override for the pricing catalog endpoint. When omitted, a
|
||||
/// sensible default is used per provider.
|
||||
pub url: Option<String>,
|
||||
pub refresh_interval: Option<u64>,
|
||||
/// Map DO catalog keys (`lowercase(creator)/model_id`) to Plano model names.
|
||||
/// Map catalog keys to Plano model names used in `routing_preferences`.
|
||||
/// DigitalOcean keys look like `lowercase(creator)/model_id`; models.dev
|
||||
/// keys look like `creator/model_id`.
|
||||
/// Example: `openai/openai-gpt-oss-120b: openai/gpt-4o`
|
||||
pub model_aliases: Option<HashMap<String, String>>,
|
||||
}
|
||||
|
|
@ -187,6 +192,8 @@ pub struct CostMetricsConfig {
|
|||
#[serde(rename_all = "snake_case")]
|
||||
pub enum CostProvider {
|
||||
Digitalocean,
|
||||
#[serde(rename = "models.dev")]
|
||||
ModelsDev,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
|
|
@ -741,6 +748,51 @@ mod test {
|
|||
}
|
||||
}
|
||||
|
||||
/// A fully specified `models.dev` cost source round-trips from YAML.
#[test]
fn test_deserialize_models_dev_cost_source() {
    let yaml = r#"
- type: cost
  provider: models.dev
  url: https://models.dev/api.json
  refresh_interval: 3600
  model_aliases:
    openai/gpt-oss-120b: openai/gpt-4o
"#;
    let sources: Vec<super::MetricsSource> = serde_yaml::from_str(yaml).unwrap();
    assert_eq!(sources.len(), 1);

    let first = &sources[0];
    let super::MetricsSource::Cost(cfg) = first else {
        panic!("expected cost source, got {first:?}");
    };

    assert!(matches!(cfg.provider, super::CostProvider::ModelsDev));
    assert_eq!(cfg.url.as_deref(), Some("https://models.dev/api.json"));
    assert_eq!(cfg.refresh_interval, Some(3600));

    let alias_target = cfg
        .model_aliases
        .as_ref()
        .and_then(|m| m.get("openai/gpt-oss-120b"))
        .map(String::as_str);
    assert_eq!(alias_target, Some("openai/gpt-4o"));
}
|
||||
|
||||
/// A DigitalOcean cost source without `url` deserializes with `url == None`.
#[test]
fn test_deserialize_digitalocean_cost_source_without_url() {
    let yaml = r#"
- type: cost
  provider: digitalocean
"#;
    let sources: Vec<super::MetricsSource> = serde_yaml::from_str(yaml).unwrap();

    let first = &sources[0];
    let super::MetricsSource::Cost(cfg) = first else {
        panic!("expected cost source, got {first:?}");
    };

    assert!(matches!(cfg.provider, super::CostProvider::Digitalocean));
    assert_eq!(cfg.url, None);
}
|
||||
|
||||
#[test]
|
||||
fn test_into_models_filters_internal_providers() {
|
||||
let providers = vec![
|
||||
|
|
|
|||
|
|
@ -209,6 +209,178 @@ Clients can let the router decide or still specify aliases:
|
|||
)
|
||||
|
||||
|
||||
.. _cost_latency_aware_selection:
|
||||
|
||||
Cost- and latency-aware selection
|
||||
---------------------------------
|
||||
|
||||
When a route lists more than one candidate model, you can let Plano reorder that
|
||||
candidate pool using **live cost or latency data** instead of relying solely on the
|
||||
order you wrote them in. This is controlled per route with ``selection_policy`` and
|
||||
backed by one or more ``model_metrics_sources``.
|
||||
|
||||
This is useful when several models are equally capable for a route and you want Plano
|
||||
to always reach for the cheapest (or fastest) option first, with the others kept as
|
||||
fallbacks.
|
||||
|
||||
Selection policy
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
||||
Attach an optional ``selection_policy`` to any entry in ``routing_preferences``:
|
||||
|
||||
.. code-block:: yaml
|
||||
:caption: Per-route selection policy
|
||||
|
||||
routing_preferences:
|
||||
- name: code review
|
||||
description: reviewing, analyzing, and suggesting improvements to existing code
|
||||
models:
|
||||
- anthropic/claude-sonnet-4-5
|
||||
- groq/llama-3.3-70b-versatile
|
||||
selection_policy:
|
||||
prefer: cheapest # cheapest | fastest | none
|
||||
|
||||
``prefer`` accepts:
|
||||
|
||||
- ``cheapest`` — order candidates by total price (input + output rate) ascending, using a ``cost`` metrics source.
|
||||
- ``fastest`` — order candidates by observed latency ascending, using a ``latency`` metrics source.
|
||||
- ``none`` (default) — keep the order you declared; no reordering.
|
||||
|
||||
Models that have no data in the selected source are ranked **last**, in their original
|
||||
order, so routing always degrades gracefully rather than dropping a candidate.
|
||||
|
||||
Configuring the pricing source
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
``cheapest`` routing needs a price catalog. Plano's **default pricing provider is
|
||||
DigitalOcean** — its GenAI model catalog is public (no API key, no signup), so cost data
|
||||
is available out of the box and is what ``planoai obs`` uses if you don't configure
|
||||
anything. The pricing source is fully swappable: point Plano at `models.dev <https://models.dev/>`_,
|
||||
or at **any endpoint that exposes a supported pricing structure**.
|
||||
|
||||
The ``provider`` field selects which response schema Plano expects (and therefore how it
|
||||
parses the catalog); the optional ``url`` lets you override the endpoint — for example to
|
||||
use a mirror, a cached copy, or an internal catalog service that returns the same shape.
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:widths: 18 34 28 20
|
||||
|
||||
* - ``provider``
|
||||
- Default catalog URL
|
||||
- Key format
|
||||
- Expected structure
|
||||
* - ``digitalocean`` *(default)*
|
||||
- DigitalOcean GenAI model catalog
|
||||
- ``lowercase(creator)/model_id``
|
||||
- ``{ data: [ { model_id, pricing: { input_price_per_million, output_price_per_million } } ] }``
|
||||
* - ``models.dev``
|
||||
- ``https://models.dev/api.json``
|
||||
- ``creator/model`` (e.g. ``anthropic/claude-sonnet-4-5``)
|
||||
- ``{ <provider>: { models: { <model>: { cost: { input, output } } } } }``
|
||||
|
||||
Because the source is selected per ``provider``, switching is a one-line change. To stay
|
||||
on the default DigitalOcean catalog you can omit ``model_metrics_sources`` entirely for
|
||||
``planoai obs``, or declare it explicitly for routing:
|
||||
|
||||
.. code-block:: yaml
|
||||
:caption: Default cost source (DigitalOcean)
|
||||
|
||||
model_metrics_sources:
|
||||
- type: cost
|
||||
provider: digitalocean # default; uses the public DO GenAI catalog
|
||||
|
||||
To switch to models.dev — an open, community-maintained catalog covering a broad range of
|
||||
providers and models — change the ``provider`` (and optionally ``url``):
|
||||
|
||||
.. code-block:: yaml
|
||||
:caption: Cost source backed by models.dev
|
||||
|
||||
model_metrics_sources:
|
||||
- type: cost
|
||||
provider: models.dev # models.dev | digitalocean
|
||||
url: https://models.dev/api.json # optional; defaults per provider
|
||||
refresh_interval: 3600 # optional, seconds; refetch on this interval
|
||||
model_aliases: # optional; see below
|
||||
openai/gpt-oss-120b: openai/gpt-4o
|
||||
|
||||
To use your own endpoint, pick the ``provider`` whose structure your endpoint matches and
|
||||
override ``url`` — Plano parses the response with that provider's schema:
|
||||
|
||||
.. code-block:: yaml
|
||||
:caption: Custom endpoint exposing the DigitalOcean catalog structure
|
||||
|
||||
model_metrics_sources:
|
||||
- type: cost
|
||||
provider: digitalocean # selects the DO response schema
|
||||
url: https://catalog.internal.example.com/pricing
|
||||
|
||||
.. note::
|
||||
The cost metric used for ranking is the sum of the input and output per-million-token
|
||||
rates — a relative signal for ordering candidates, not a per-request bill. For actual
|
||||
per-request cost, see the observability console below.
|
||||
|
||||
Matching catalog keys to your models
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
The router looks up each candidate model by the exact name you use in
``routing_preferences`` (e.g. ``anthropic/claude-sonnet-4-5``). models.dev groups models by
provider, and Plano builds each catalog key as ``provider/model`` (also registering the bare
model id as a fallback), which lines up with Plano's ``provider/model`` naming, so most
models match automatically.
|
||||
|
||||
When a catalog key does not match your model name — for example a version skew, or an
|
||||
open-weight model you serve under a different provider — use ``model_aliases`` to map the
|
||||
**catalog key** to the **Plano model name** used in your routing preferences:
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
model_metrics_sources:
|
||||
- type: cost
|
||||
provider: models.dev
|
||||
model_aliases:
|
||||
# catalog key : plano model name
|
||||
openai/gpt-oss-120b: openai/gpt-4o
|
||||
|
||||
Latency source
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
``fastest`` routing reads observed latency from a Prometheus instance. Provide the query
|
||||
that returns a per-model latency value (lower is faster), labelled by ``model_name``:
|
||||
|
||||
.. code-block:: yaml
|
||||
:caption: Latency source backed by Prometheus
|
||||
|
||||
model_metrics_sources:
|
||||
- type: latency
|
||||
provider: prometheus
|
||||
url: http://prometheus:9090
|
||||
query: avg by (model_name) (rate(plano_llm_latency_seconds_sum[5m]))
|
||||
refresh_interval: 60
|
||||
|
||||
You can declare both a ``cost`` and a ``latency`` source at the same time; each route
|
||||
picks whichever it needs based on its ``selection_policy``.
|
||||
|
||||
Cost in the observability console
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
``planoai obs`` displays a per-request USD cost column derived from the same pricing
|
||||
catalog. By default it reads the ``cost`` source from your config (the first
|
||||
``type: cost`` entry under ``model_metrics_sources``); you can also override it on the
|
||||
command line:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# Use the cost source from ./config.yaml (default)
|
||||
planoai obs
|
||||
|
||||
# Or override the provider / endpoint explicitly
|
||||
planoai obs --pricing-provider models.dev
|
||||
planoai obs --pricing-provider models.dev --pricing-url https://models.dev/api.json
|
||||
|
||||
If no source is configured and no override is given, ``planoai obs`` falls back to the
|
||||
DigitalOcean catalog so the cost column still populates out of the box.
|
||||
|
||||
|
||||
Plano-Orchestrator
|
||||
-------------------
|
||||
Plano-Orchestrator is a **preference-based routing model** specifically designed to address the limitations of traditional LLM routing. It delivers production-ready performance with low latency and high accuracy while solving key routing challenges.
|
||||
|
|
|
|||
|
|
@ -86,6 +86,24 @@ routing_preferences:
|
|||
selection_policy:
|
||||
prefer: cheapest
|
||||
|
||||
# model_metrics_sources: external catalogs the router reads to reorder candidate
|
||||
# models for selection_policy.prefer. A `cost` source ranks `prefer: cheapest`;
|
||||
# a `latency` source ranks `prefer: fastest`. Both are optional.
|
||||
model_metrics_sources:
|
||||
# Cost catalog. provider: models.dev | digitalocean (default url per provider).
|
||||
- type: cost
|
||||
provider: models.dev
|
||||
url: https://models.dev/api.json # optional; omit to use the provider default
|
||||
refresh_interval: 3600 # optional, seconds
|
||||
model_aliases: # optional: catalog key -> Plano model name
|
||||
openai/gpt-oss-120b: openai/gpt-4o
|
||||
# Latency catalog (Prometheus). Used for selection_policy.prefer: fastest.
|
||||
- type: latency
|
||||
provider: prometheus
|
||||
url: http://prometheus:9090
|
||||
query: avg by (model_name) (rate(plano_llm_latency_seconds_sum[5m]))
|
||||
refresh_interval: 60
|
||||
|
||||
# HTTP listeners - entry points for agent routing, prompt targets, and direct LLM access
|
||||
listeners:
|
||||
# Agent listener for routing requests to multiple agents
|
||||
|
|
|
|||
|
|
@ -115,6 +115,18 @@ model_aliases:
|
|||
target: gpt-4o-mini
|
||||
smart-llm:
|
||||
target: gpt-4o
|
||||
model_metrics_sources:
|
||||
- model_aliases:
|
||||
openai/gpt-oss-120b: openai/gpt-4o
|
||||
provider: models.dev
|
||||
refresh_interval: 3600
|
||||
type: cost
|
||||
url: https://models.dev/api.json
|
||||
- provider: prometheus
|
||||
query: avg by (model_name) (rate(plano_llm_latency_seconds_sum[5m]))
|
||||
refresh_interval: 60
|
||||
type: latency
|
||||
url: http://prometheus:9090
|
||||
model_providers:
|
||||
- access_key: $OPENAI_API_KEY
|
||||
default: true
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue