plano/crates/brightstaff/src/router/model_metrics.rs

388 lines
14 KiB
Rust

use std::collections::HashMap;
use std::sync::Arc;
use std::time::Duration;
use common::configuration::{
CostProvider, LatencyProvider, MetricsSource, SelectionPolicy, SelectionPreference,
};
use tokio::sync::RwLock;
use tracing::{debug, info, warn};
const DO_PRICING_URL: &str = "https://api.digitalocean.com/v2/gen-ai/models/catalog";
pub struct ModelMetricsService {
cost: Arc<RwLock<HashMap<String, f64>>>,
latency: Arc<RwLock<HashMap<String, f64>>>,
}
impl ModelMetricsService {
pub async fn new(sources: &[MetricsSource], client: reqwest::Client) -> Self {
let cost_data = Arc::new(RwLock::new(HashMap::new()));
let latency_data = Arc::new(RwLock::new(HashMap::new()));
for source in sources {
match source {
MetricsSource::Cost(cfg) => match cfg.provider {
CostProvider::Digitalocean => {
let aliases = cfg.model_aliases.clone().unwrap_or_default();
let data = fetch_do_pricing(&client, &aliases).await;
info!(models = data.len(), "fetched digitalocean pricing");
*cost_data.write().await = data;
if let Some(interval_secs) = cfg.refresh_interval {
let cost_clone = Arc::clone(&cost_data);
let client_clone = client.clone();
let interval = Duration::from_secs(interval_secs);
tokio::spawn(async move {
loop {
tokio::time::sleep(interval).await;
let data = fetch_do_pricing(&client_clone, &aliases).await;
info!(models = data.len(), "refreshed digitalocean pricing");
*cost_clone.write().await = data;
}
});
}
}
},
MetricsSource::Latency(cfg) => match cfg.provider {
LatencyProvider::Prometheus => {
let data = fetch_prometheus_metrics(&cfg.url, &cfg.query, &client).await;
info!(models = data.len(), url = %cfg.url, "fetched latency metrics");
*latency_data.write().await = data;
if let Some(interval_secs) = cfg.refresh_interval {
let latency_clone = Arc::clone(&latency_data);
let client_clone = client.clone();
let url = cfg.url.clone();
let query = cfg.query.clone();
let interval = Duration::from_secs(interval_secs);
tokio::spawn(async move {
loop {
tokio::time::sleep(interval).await;
let data =
fetch_prometheus_metrics(&url, &query, &client_clone).await;
info!(models = data.len(), url = %url, "refreshed latency metrics");
*latency_clone.write().await = data;
}
});
}
}
},
}
}
ModelMetricsService {
cost: cost_data,
latency: latency_data,
}
}
/// Rank `models` by `policy`, returning them in preference order.
/// Models with no metric data are appended at the end in their original order.
pub async fn rank_models(&self, models: &[String], policy: &SelectionPolicy) -> Vec<String> {
let cost_data = self.cost.read().await;
let latency_data = self.latency.read().await;
debug!(
input_models = ?models,
cost_data = ?cost_data.iter().collect::<Vec<_>>(),
latency_data = ?latency_data.iter().collect::<Vec<_>>(),
prefer = ?policy.prefer,
"rank_models called"
);
match policy.prefer {
SelectionPreference::Cheapest => {
for m in models {
if !cost_data.contains_key(m.as_str()) {
warn!(model = %m, "no cost data for model — ranking last (prefer: cheapest)");
}
}
rank_by_ascending_metric(models, &cost_data)
}
SelectionPreference::Fastest => {
for m in models {
if !latency_data.contains_key(m.as_str()) {
warn!(model = %m, "no latency data for model — ranking last (prefer: fastest)");
}
}
rank_by_ascending_metric(models, &latency_data)
}
SelectionPreference::None => models.to_vec(),
}
}
/// Returns a snapshot of the current cost data. Used at startup to warn about unmatched models.
pub async fn cost_snapshot(&self) -> HashMap<String, f64> {
self.cost.read().await.clone()
}
/// Returns a snapshot of the current latency data. Used at startup to warn about unmatched models.
pub async fn latency_snapshot(&self) -> HashMap<String, f64> {
self.latency.read().await.clone()
}
}
fn rank_by_ascending_metric(models: &[String], data: &HashMap<String, f64>) -> Vec<String> {
let mut with_data: Vec<(&String, f64)> = models
.iter()
.filter_map(|m| {
let v = *data.get(m.as_str())?;
if v.is_nan() {
None
} else {
Some((m, v))
}
})
.collect();
with_data.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
let without_data: Vec<&String> = models
.iter()
.filter(|m| data.get(m.as_str()).is_none_or(|v| v.is_nan()))
.collect();
with_data
.iter()
.map(|(m, _)| (*m).clone())
.chain(without_data.iter().map(|m| (*m).clone()))
.collect()
}
#[derive(serde::Deserialize)]
struct DoModelList {
data: Vec<DoModel>,
}
#[derive(serde::Deserialize)]
struct DoModel {
model_id: String,
pricing: Option<DoPricing>,
}
#[derive(serde::Deserialize)]
struct DoPricing {
input_price_per_million: Option<f64>,
output_price_per_million: Option<f64>,
}
async fn fetch_do_pricing(
client: &reqwest::Client,
aliases: &HashMap<String, String>,
) -> HashMap<String, f64> {
match client.get(DO_PRICING_URL).send().await {
Ok(resp) => match resp.json::<DoModelList>().await {
Ok(list) => list
.data
.into_iter()
.filter_map(|m| {
let pricing = m.pricing?;
let raw_key = m.model_id.clone();
let key = aliases.get(&raw_key).cloned().unwrap_or(raw_key);
let cost = pricing.input_price_per_million.unwrap_or(0.0)
+ pricing.output_price_per_million.unwrap_or(0.0);
Some((key, cost))
})
.collect(),
Err(err) => {
warn!(error = %err, url = DO_PRICING_URL, "failed to parse digitalocean pricing response");
HashMap::new()
}
},
Err(err) => {
warn!(error = %err, url = DO_PRICING_URL, "failed to fetch digitalocean pricing");
HashMap::new()
}
}
}
#[derive(serde::Deserialize)]
struct PrometheusResponse {
data: PrometheusData,
}
#[derive(serde::Deserialize)]
struct PrometheusData {
result: Vec<PrometheusResult>,
}
#[derive(serde::Deserialize)]
struct PrometheusResult {
metric: HashMap<String, String>,
value: (f64, String), // (timestamp, value_str)
}
async fn fetch_prometheus_metrics(
url: &str,
query: &str,
client: &reqwest::Client,
) -> HashMap<String, f64> {
let query_url = format!("{}/api/v1/query", url.trim_end_matches('/'));
match client
.get(&query_url)
.query(&[("query", query)])
.send()
.await
{
Ok(resp) => match resp.json::<PrometheusResponse>().await {
Ok(prom) => prom
.data
.result
.into_iter()
.filter_map(|r| {
let model_name = r.metric.get("model_name")?.clone();
let value: f64 = r.value.1.parse().ok()?;
Some((model_name, value))
})
.collect(),
Err(err) => {
warn!(error = %err, url = %query_url, "failed to parse prometheus response");
HashMap::new()
}
},
Err(err) => {
warn!(error = %err, url = %query_url, "failed to fetch prometheus metrics");
HashMap::new()
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use common::configuration::SelectionPreference;
fn make_policy(prefer: SelectionPreference) -> SelectionPolicy {
SelectionPolicy { prefer }
}
#[test]
fn test_rank_by_ascending_metric_picks_lowest_first() {
let models = vec!["a".to_string(), "b".to_string(), "c".to_string()];
let mut data = HashMap::new();
data.insert("a".to_string(), 0.01);
data.insert("b".to_string(), 0.005);
data.insert("c".to_string(), 0.02);
assert_eq!(
rank_by_ascending_metric(&models, &data),
vec!["b", "a", "c"]
);
}
#[test]
fn test_rank_by_ascending_metric_no_data_preserves_order() {
let models = vec!["x".to_string(), "y".to_string()];
let data = HashMap::new();
assert_eq!(rank_by_ascending_metric(&models, &data), vec!["x", "y"]);
}
#[test]
fn test_rank_by_ascending_metric_partial_data() {
let models = vec!["a".to_string(), "b".to_string()];
let mut data = HashMap::new();
data.insert("b".to_string(), 100.0);
assert_eq!(rank_by_ascending_metric(&models, &data), vec!["b", "a"]);
}
#[tokio::test]
async fn test_rank_models_cheapest() {
let service = ModelMetricsService {
cost: Arc::new(RwLock::new({
let mut m = HashMap::new();
m.insert("gpt-4o".to_string(), 0.005);
m.insert("gpt-4o-mini".to_string(), 0.0001);
m
})),
latency: Arc::new(RwLock::new(HashMap::new())),
};
let models = vec!["gpt-4o".to_string(), "gpt-4o-mini".to_string()];
let result = service
.rank_models(&models, &make_policy(SelectionPreference::Cheapest))
.await;
assert_eq!(result, vec!["gpt-4o-mini", "gpt-4o"]);
}
#[tokio::test]
async fn test_rank_models_fastest() {
let service = ModelMetricsService {
cost: Arc::new(RwLock::new(HashMap::new())),
latency: Arc::new(RwLock::new({
let mut m = HashMap::new();
m.insert("gpt-4o".to_string(), 200.0);
m.insert("claude-sonnet".to_string(), 120.0);
m
})),
};
let models = vec!["gpt-4o".to_string(), "claude-sonnet".to_string()];
let result = service
.rank_models(&models, &make_policy(SelectionPreference::Fastest))
.await;
assert_eq!(result, vec!["claude-sonnet", "gpt-4o"]);
}
#[tokio::test]
async fn test_rank_models_fallback_no_metrics() {
let service = ModelMetricsService {
cost: Arc::new(RwLock::new(HashMap::new())),
latency: Arc::new(RwLock::new(HashMap::new())),
};
let models = vec!["model-a".to_string(), "model-b".to_string()];
let result = service
.rank_models(&models, &make_policy(SelectionPreference::Cheapest))
.await;
assert_eq!(result, vec!["model-a", "model-b"]);
}
#[tokio::test]
async fn test_rank_models_partial_data_appended_last() {
let service = ModelMetricsService {
cost: Arc::new(RwLock::new({
let mut m = HashMap::new();
m.insert("gpt-4o".to_string(), 0.005);
m
})),
latency: Arc::new(RwLock::new(HashMap::new())),
};
let models = vec!["gpt-4o-mini".to_string(), "gpt-4o".to_string()];
let result = service
.rank_models(&models, &make_policy(SelectionPreference::Cheapest))
.await;
assert_eq!(result, vec!["gpt-4o", "gpt-4o-mini"]);
}
#[tokio::test]
async fn test_rank_models_none_preserves_order() {
let service = ModelMetricsService {
cost: Arc::new(RwLock::new({
let mut m = HashMap::new();
m.insert("gpt-4o-mini".to_string(), 0.0001);
m.insert("gpt-4o".to_string(), 0.005);
m
})),
latency: Arc::new(RwLock::new(HashMap::new())),
};
let models = vec!["gpt-4o".to_string(), "gpt-4o-mini".to_string()];
let result = service
.rank_models(&models, &make_policy(SelectionPreference::None))
.await;
// none → original order, despite gpt-4o-mini being cheaper
assert_eq!(result, vec!["gpt-4o", "gpt-4o-mini"]);
}
#[test]
fn test_rank_by_ascending_metric_nan_treated_as_missing() {
let models = vec![
"a".to_string(),
"b".to_string(),
"c".to_string(),
"d".to_string(),
];
let mut data = HashMap::new();
data.insert("a".to_string(), f64::NAN);
data.insert("b".to_string(), 0.5);
data.insert("c".to_string(), 0.1);
// "d" has no entry at all
let result = rank_by_ascending_metric(&models, &data);
// c (0.1) < b (0.5), then NaN "a" and missing "d" appended in original order
assert_eq!(result, vec!["c", "b", "a", "d"]);
}
}