merge main into plano-session_pinning

This commit is contained in:
Adil Hafeez 2026-04-02 22:58:47 -07:00
commit f699cfb059
86 changed files with 11996 additions and 8063 deletions

View file

@ -2,7 +2,6 @@ use crate::{
configuration::LlmProvider,
consts::{ARCH_FC_MODEL_NAME, ASSISTANT_ROLE},
};
use core::{panic, str};
use serde::{ser::SerializeMap, Deserialize, Serialize};
use std::{
collections::{HashMap, VecDeque},
@ -193,7 +192,7 @@ impl Display for ContentType {
// skip image URLs or their data in text representation
None
} else {
panic!("Unsupported content type: {:?}", part.content_type);
None
}
})
.collect();

View file

@ -1,5 +1,5 @@
use hermesllm::apis::openai::{ModelDetail, ModelObject, Models};
use serde::{Deserialize, Serialize};
use serde::{Deserialize, Deserializer, Serialize};
use std::collections::HashMap;
use std::fmt::Display;
@ -112,6 +112,77 @@ pub enum StateStorageType {
Postgres,
}
/// Strategy for ordering candidate models within a routing preference.
/// Serialized in lowercase (`cheapest`, `fastest`, `none`) per the
/// `rename_all` attribute below.
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum SelectionPreference {
/// Prefer the lowest-cost candidate — presumably ranked via the
/// `Cost` metrics source; confirm against the selection logic.
Cheapest,
/// Prefer the lowest-latency candidate — presumably ranked via the
/// `Latency` metrics source; confirm against the selection logic.
Fastest,
/// Return models in the same order they were defined — no reordering.
/// The empty-string alias lets `prefer: ""` in config mean "none".
#[default]
#[serde(alias = "")]
None,
}
/// Policy controlling how a model is picked from a preference's candidates.
/// `Default` yields `prefer: SelectionPreference::None` (no reordering).
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct SelectionPolicy {
// `default` covers a missing field; the custom deserializer additionally
// maps an explicit `null` (and, via the variant alias, "") to the default.
#[serde(default, deserialize_with = "deserialize_selection_preference")]
pub prefer: SelectionPreference,
}
/// Deserialize a `SelectionPreference`, treating an explicit `null` the
/// same as an absent value: both collapse to the enum's `#[default]`
/// variant (`SelectionPreference::None`).
fn deserialize_selection_preference<'de, D>(
    deserializer: D,
) -> Result<SelectionPreference, D::Error>
where
    D: Deserializer<'de>,
{
    // Parse as Option first so `null` is accepted, then fall back.
    let parsed: Option<SelectionPreference> = Option::deserialize(deserializer)?;
    Ok(parsed.unwrap_or_default())
}
/// A named routing preference declared at the top level of the
/// configuration, listing the candidate models it can route to.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TopLevelRoutingPreference {
// Identifier for this preference.
pub name: String,
// Human-readable description — presumably used for matching/routing
// decisions elsewhere; confirm against the router implementation.
pub description: String,
// Candidate model names this preference may select from.
pub models: Vec<String>,
// Optional in config; defaults to no reordering (SelectionPreference::None).
#[serde(default)]
pub selection_policy: SelectionPolicy,
}
/// A source of per-model metrics. Internally tagged in config on the
/// `type` field with snake_case variant names (`type: cost` / `type: latency`).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum MetricsSource {
// Cost metrics, e.g. per-token pricing from a provider catalog.
Cost(CostMetricsConfig),
// Latency metrics scraped from a query endpoint.
Latency(LatencyMetricsConfig),
}
/// Configuration for a cost metrics source.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CostMetricsConfig {
// Which catalog to pull pricing from.
pub provider: CostProvider,
// How often to refresh — presumably seconds; confirm units with the
// code that consumes this value.
pub refresh_interval: Option<u64>,
/// Map DO catalog keys (`lowercase(creator)/model_id`) to Plano model names.
/// Example: `openai/openai-gpt-oss-120b: openai/gpt-4o`
pub model_aliases: Option<HashMap<String, String>>,
}
/// Supported cost-catalog providers. Serialized in snake_case
/// (`digitalocean`).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum CostProvider {
Digitalocean,
}
/// Configuration for a latency metrics source.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LatencyMetricsConfig {
// Which metrics backend to query.
pub provider: LatencyProvider,
// Endpoint of the metrics backend.
pub url: String,
// Query string sent to the backend — presumably PromQL for the
// Prometheus provider; confirm against the fetcher.
pub query: String,
// How often to refresh — presumably seconds; confirm units with the
// code that consumes this value.
pub refresh_interval: Option<u64>,
}
/// Supported latency-metrics backends. Serialized in snake_case
/// (`prometheus`).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum LatencyProvider {
Prometheus,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Configuration {
pub version: String,
@ -131,6 +202,8 @@ pub struct Configuration {
pub filters: Option<Vec<Agent>>,
pub listeners: Vec<Listener>,
pub state_storage: Option<StateStorageConfig>,
pub routing_preferences: Option<Vec<TopLevelRoutingPreference>>,
pub model_metrics_sources: Option<Vec<MetricsSource>>,
}
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
@ -246,6 +319,8 @@ pub enum TimeUnit {
Minute,
#[serde(rename = "hour")]
Hour,
#[serde(rename = "day")]
Day,
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
@ -326,18 +401,6 @@ impl LlmProviderType {
}
}
/// Associates a model with the routing preferences it participates in.
/// NOTE(review): this diff removes this type; top-level routing
/// preferences appear to replace the per-model form.
#[derive(Serialize, Deserialize, Debug)]
pub struct ModelUsagePreference {
// Model name — presumably a Plano model identifier; verify against callers.
pub model: String,
// Preferences this model serves.
pub routing_preferences: Vec<RoutingPreference>,
}
/// A named routing preference with a human-readable description.
/// NOTE(review): this diff removes this type along with the
/// per-provider `routing_preferences` fields.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RoutingPreference {
// Identifier for this preference.
pub name: String,
// Human-readable description of when this preference applies.
pub description: String,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct AgentUsagePreference {
pub model: String,
@ -387,7 +450,6 @@ pub struct LlmProvider {
pub port: Option<u16>,
pub rate_limits: Option<LlmRatelimit>,
pub usage: Option<String>,
pub routing_preferences: Option<Vec<RoutingPreference>>,
pub cluster_name: Option<String>,
pub base_url_path_prefix: Option<String>,
pub internal: Option<bool>,
@ -431,7 +493,6 @@ impl Default for LlmProvider {
port: None,
rate_limits: None,
usage: None,
routing_preferences: None,
cluster_name: None,
base_url_path_prefix: None,
internal: None,

View file

@ -75,7 +75,10 @@ pub trait Client: Context {
fn add_call_context(&self, id: u32, call_context: Self::CallContext) {
let callouts = self.callouts();
if callouts.borrow_mut().insert(id, call_context).is_some() {
panic!("Duplicate http call with id={}", id);
log::warn!(
"Duplicate http call with id={}, previous context overwritten",
id
);
}
self.active_http_calls().increment(1);
}

View file

@ -274,7 +274,6 @@ mod tests {
port: None,
rate_limits: None,
usage: None,
routing_preferences: None,
internal: None,
stream: None,
passthrough_auth: None,

View file

@ -73,7 +73,10 @@ impl RatelimitMap {
match new_ratelimit_map.datastore.get_mut(&ratelimit_config.model) {
Some(limits) => match limits.get_mut(&ratelimit_config.selector) {
Some(_) => {
panic!("repeated selector. Selectors per provider must be unique")
log::error!(
"repeated selector for model '{}'. Selectors per provider must be unique, skipping duplicate",
ratelimit_config.model
);
}
None => {
limits.insert(ratelimit_config.selector, limit);
@ -150,6 +153,10 @@ fn get_quota(limit: Limit) -> Quota {
TimeUnit::Second => Quota::per_second(tokens),
TimeUnit::Minute => Quota::per_minute(tokens),
TimeUnit::Hour => Quota::per_hour(tokens),
TimeUnit::Day => {
let per_hour = limit.tokens.saturating_div(24).max(1);
Quota::per_hour(NonZero::new(per_hour).expect("per_hour must be positive"))
}
}
}