merge main into plano-session_pinning

This commit is contained in:
Adil Hafeez 2026-04-02 22:58:47 -07:00
commit f699cfb059
86 changed files with 11996 additions and 8063 deletions

View file

@ -2,7 +2,6 @@ use crate::{
configuration::LlmProvider,
consts::{ARCH_FC_MODEL_NAME, ASSISTANT_ROLE},
};
use core::{panic, str};
use serde::{ser::SerializeMap, Deserialize, Serialize};
use std::{
collections::{HashMap, VecDeque},
@ -193,7 +192,7 @@ impl Display for ContentType {
// skip image URLs or their data in text representation
None
} else {
panic!("Unsupported content type: {:?}", part.content_type);
None
}
})
.collect();

View file

@ -1,5 +1,5 @@
use hermesllm::apis::openai::{ModelDetail, ModelObject, Models};
use serde::{Deserialize, Serialize};
use serde::{Deserialize, Deserializer, Serialize};
use std::collections::HashMap;
use std::fmt::Display;
@ -112,6 +112,77 @@ pub enum StateStorageType {
Postgres,
}
/// Strategy for ordering candidate models within a routing preference.
/// Serialized in lowercase (`cheapest`, `fastest`, `none`) per the
/// `rename_all` attribute below.
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum SelectionPreference {
/// Prefer the lowest-cost candidate — presumably ranked via the
/// `Cost` metrics source; confirm against the selection logic.
Cheapest,
/// Prefer the lowest-latency candidate — presumably ranked via the
/// `Latency` metrics source; confirm against the selection logic.
Fastest,
/// Return models in the same order they were defined — no reordering.
/// The empty-string alias lets `prefer: ""` in config mean "none".
#[default]
#[serde(alias = "")]
None,
}
/// Policy controlling how a model is picked from a preference's candidates.
/// `Default` yields `prefer: SelectionPreference::None` (no reordering).
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct SelectionPolicy {
// `default` covers a missing field; the custom deserializer additionally
// maps an explicit `null` (and, via the variant alias, "") to the default.
#[serde(default, deserialize_with = "deserialize_selection_preference")]
pub prefer: SelectionPreference,
}
/// Deserialize a `SelectionPreference`, treating an explicit `null` the
/// same as an absent value: both collapse to the enum's `#[default]`
/// variant (`SelectionPreference::None`).
fn deserialize_selection_preference<'de, D>(
    deserializer: D,
) -> Result<SelectionPreference, D::Error>
where
    D: Deserializer<'de>,
{
    // Parse as Option first so `null` is accepted, then fall back.
    let parsed: Option<SelectionPreference> = Option::deserialize(deserializer)?;
    Ok(parsed.unwrap_or_default())
}
/// A named routing preference declared at the top level of the
/// configuration, listing the candidate models it can route to.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TopLevelRoutingPreference {
// Identifier for this preference.
pub name: String,
// Human-readable description — presumably used for matching/routing
// decisions elsewhere; confirm against the router implementation.
pub description: String,
// Candidate model names this preference may select from.
pub models: Vec<String>,
// Optional in config; defaults to no reordering (SelectionPreference::None).
#[serde(default)]
pub selection_policy: SelectionPolicy,
}
/// A source of per-model metrics. Internally tagged in config on the
/// `type` field with snake_case variant names (`type: cost` / `type: latency`).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum MetricsSource {
// Cost metrics, e.g. per-token pricing from a provider catalog.
Cost(CostMetricsConfig),
// Latency metrics scraped from a query endpoint.
Latency(LatencyMetricsConfig),
}
/// Configuration for a cost metrics source.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CostMetricsConfig {
// Which catalog to pull pricing from.
pub provider: CostProvider,
// How often to refresh — presumably seconds; confirm units with the
// code that consumes this value.
pub refresh_interval: Option<u64>,
/// Map DO catalog keys (`lowercase(creator)/model_id`) to Plano model names.
/// Example: `openai/openai-gpt-oss-120b: openai/gpt-4o`
pub model_aliases: Option<HashMap<String, String>>,
}
/// Supported cost-catalog providers. Serialized in snake_case
/// (`digitalocean`).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum CostProvider {
Digitalocean,
}
/// Configuration for a latency metrics source.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LatencyMetricsConfig {
// Which metrics backend to query.
pub provider: LatencyProvider,
// Endpoint of the metrics backend.
pub url: String,
// Query string sent to the backend — presumably PromQL for the
// Prometheus provider; confirm against the fetcher.
pub query: String,
// How often to refresh — presumably seconds; confirm units with the
// code that consumes this value.
pub refresh_interval: Option<u64>,
}
/// Supported latency-metrics backends. Serialized in snake_case
/// (`prometheus`).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum LatencyProvider {
Prometheus,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Configuration {
pub version: String,
@ -131,6 +202,8 @@ pub struct Configuration {
pub filters: Option<Vec<Agent>>,
pub listeners: Vec<Listener>,
pub state_storage: Option<StateStorageConfig>,
pub routing_preferences: Option<Vec<TopLevelRoutingPreference>>,
pub model_metrics_sources: Option<Vec<MetricsSource>>,
}
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
@ -246,6 +319,8 @@ pub enum TimeUnit {
Minute,
#[serde(rename = "hour")]
Hour,
#[serde(rename = "day")]
Day,
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
@ -326,18 +401,6 @@ impl LlmProviderType {
}
}
/// Associates a model with the routing preferences it participates in.
/// NOTE(review): this diff removes this type; top-level routing
/// preferences appear to replace the per-model form.
#[derive(Serialize, Deserialize, Debug)]
pub struct ModelUsagePreference {
// Model name — presumably a Plano model identifier; verify against callers.
pub model: String,
// Preferences this model serves.
pub routing_preferences: Vec<RoutingPreference>,
}
/// A named routing preference with a human-readable description.
/// NOTE(review): this diff removes this type along with the
/// per-provider `routing_preferences` fields.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RoutingPreference {
// Identifier for this preference.
pub name: String,
// Human-readable description of when this preference applies.
pub description: String,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct AgentUsagePreference {
pub model: String,
@ -387,7 +450,6 @@ pub struct LlmProvider {
pub port: Option<u16>,
pub rate_limits: Option<LlmRatelimit>,
pub usage: Option<String>,
pub routing_preferences: Option<Vec<RoutingPreference>>,
pub cluster_name: Option<String>,
pub base_url_path_prefix: Option<String>,
pub internal: Option<bool>,
@ -431,7 +493,6 @@ impl Default for LlmProvider {
port: None,
rate_limits: None,
usage: None,
routing_preferences: None,
cluster_name: None,
base_url_path_prefix: None,
internal: None,

View file

@ -75,7 +75,10 @@ pub trait Client: Context {
fn add_call_context(&self, id: u32, call_context: Self::CallContext) {
let callouts = self.callouts();
if callouts.borrow_mut().insert(id, call_context).is_some() {
panic!("Duplicate http call with id={}", id);
log::warn!(
"Duplicate http call with id={}, previous context overwritten",
id
);
}
self.active_http_calls().increment(1);
}

View file

@ -274,7 +274,6 @@ mod tests {
port: None,
rate_limits: None,
usage: None,
routing_preferences: None,
internal: None,
stream: None,
passthrough_auth: None,

View file

@ -73,7 +73,10 @@ impl RatelimitMap {
match new_ratelimit_map.datastore.get_mut(&ratelimit_config.model) {
Some(limits) => match limits.get_mut(&ratelimit_config.selector) {
Some(_) => {
panic!("repeated selector. Selectors per provider must be unique")
log::error!(
"repeated selector for model '{}'. Selectors per provider must be unique, skipping duplicate",
ratelimit_config.model
);
}
None => {
limits.insert(ratelimit_config.selector, limit);
@ -150,6 +153,10 @@ fn get_quota(limit: Limit) -> Quota {
TimeUnit::Second => Quota::per_second(tokens),
TimeUnit::Minute => Quota::per_minute(tokens),
TimeUnit::Hour => Quota::per_hour(tokens),
TimeUnit::Day => {
let per_hour = limit.tokens.saturating_div(24).max(1);
Quota::per_hour(NonZero::new(per_hour).expect("per_hour must be positive"))
}
}
}