diff --git a/crates/brightstaff/src/main.rs b/crates/brightstaff/src/main.rs
index 24a60c14..e0970c42 100644
--- a/crates/brightstaff/src/main.rs
+++ b/crates/brightstaff/src/main.rs
@@ -187,11 +187,15 @@ async fn init_app_state(
     }
 
     // Validate that all models referenced in top-level routing_preferences exist in model_providers.
+    // The CLI renders model_providers with `name` = "openai/gpt-4o" and `model` = "gpt-4o",
+    // so we accept a match against either field.
     if let Some(ref route_prefs) = config.routing_preferences {
         let provider_model_names: std::collections::HashSet<&str> = config
             .model_providers
             .iter()
-            .flat_map(|p| p.model.as_deref())
+            .flat_map(|p| {
+                std::iter::once(p.name.as_str()).chain(p.model.as_deref())
+            })
             .collect();
         for pref in route_prefs {
             for model in &pref.models {
diff --git a/crates/brightstaff/src/router/model_metrics.rs b/crates/brightstaff/src/router/model_metrics.rs
index 078b8938..604ed2a4 100644
--- a/crates/brightstaff/src/router/model_metrics.rs
+++ b/crates/brightstaff/src/router/model_metrics.rs
@@ -6,7 +6,7 @@ use common::configuration::{MetricsSource, SelectionPolicy, SelectionPreference};
 use tokio::sync::RwLock;
 use tracing::{info, warn};
 
-const DO_PRICING_URL: &str = "https://api.digitalocean.com/v2/gen-ai/models";
+const DO_PRICING_URL: &str = "https://api.digitalocean.com/v2/gen-ai/models/catalog";
 
 pub struct ModelMetricsService {
     cost: Arc<RwLock<HashMap<String, f64>>>,
diff --git a/crates/common/src/configuration.rs b/crates/common/src/configuration.rs
index 1e4c1985..befbc9f2 100644
--- a/crates/common/src/configuration.rs
+++ b/crates/common/src/configuration.rs
@@ -147,6 +147,7 @@ pub enum MetricsSource {
         query: String,
         refresh_interval: Option<u64>,
     },
+    #[serde(rename = "digitalocean_pricing")]
     DigitalOceanPricing {
         refresh_interval: Option<u64>,
     },
diff --git a/demos/llm_routing/model_routing_service/config.yaml b/demos/llm_routing/model_routing_service/config.yaml
index 34ae2f50..543f3902 100644
--- a/demos/llm_routing/model_routing_service/config.yaml
+++ b/demos/llm_routing/model_routing_service/config.yaml
@@ -34,13 +34,12 @@ routing_preferences:
     selection_policy:
       prefer: fastest
 
 model_metrics_sources:
-  - type: digitalocean_pricing
-    refresh_interval: 3600
+  - type: cost_metrics
+    url: http://localhost:8080/costs
+    refresh_interval: 300
   - type: prometheus_metrics
     url: http://localhost:9090
     query: model_latency_p95_seconds
     refresh_interval: 60
 
-tracing:
-  random_sampling: 100
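The two sources above wire the demo to its local stack: `cost_metrics` polls the JSON served at `http://localhost:8080/costs` every 300 s, and `prometheus_metrics` evaluates `model_latency_p95_seconds` against Prometheus every 60 s. Below is a minimal stdlib probe for checking that wiring by hand; the `/api/v1/query` instant-query path and the response parsing are assumptions about the standard Prometheus HTTP API, not brightstaff code.

```python
# Hypothetical probe for the two metrics sources configured in config.yaml.
import json
import urllib.parse
import urllib.request


def fetch_costs(url="http://localhost:8080/costs"):
    # cost_metrics source: JSON map of model -> per-million-token prices.
    with urllib.request.urlopen(url) as resp:
        return json.load(resp)


def fetch_p95(base="http://localhost:9090", query="model_latency_p95_seconds"):
    # prometheus_metrics source: assumed to behave like a standard instant query.
    params = urllib.parse.urlencode({"query": query})
    with urllib.request.urlopen(f"{base}/api/v1/query?{params}") as resp:
        data = json.load(resp)
    return {r["metric"]["model_name"]: float(r["value"][1])
            for r in data["data"]["result"]}


if __name__ == "__main__":
    print(fetch_costs())  # e.g. {"openai/gpt-4o-mini": {"input_per_million": 0.15, ...}, ...}
    print(fetch_p95())    # e.g. {"openai/gpt-4o-mini": 0.4, ...}
```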
diff --git a/demos/llm_routing/model_routing_service/demo.sh b/demos/llm_routing/model_routing_service/demo.sh
index 0c3fdc5d..3ad102f1 100755
--- a/demos/llm_routing/model_routing_service/demo.sh
+++ b/demos/llm_routing/model_routing_service/demo.sh
@@ -8,9 +8,12 @@
 echo ""
 echo "This demo shows how to use the /routing/v1/* endpoints to get"
 echo "routing decisions without actually proxying the request to an LLM."
 echo ""
+echo "The response includes a ranked 'models' list; use models[0] as the"
+echo "primary and fall back to models[1] on 429/5xx errors."
+echo ""
 
-# --- Example 1: OpenAI Chat Completions format ---
-echo "--- 1. Code generation query (OpenAI format) ---"
+# --- Example 1: Code generation (ranked by fastest) ---
+echo "--- 1. Code generation query (prefer: fastest) ---"
 echo ""
 curl -s "$PLANO_URL/routing/v1/chat/completions" \
@@ -22,8 +25,8 @@
   }' | python3 -m json.tool
 echo ""
 
-# --- Example 2: Complex reasoning query ---
-echo "--- 2. Complex reasoning query (OpenAI format) ---"
+# --- Example 2: Complex reasoning (ranked by cheapest) ---
+echo "--- 2. Complex reasoning query (prefer: cheapest) ---"
 echo ""
 curl -s "$PLANO_URL/routing/v1/chat/completions" \
   -H "Content-Type: application/json" \
@@ -36,7 +39,7 @@
 echo ""
 
 # --- Example 3: Simple query (no routing match) ---
-echo "--- 3. Simple query - no routing match (OpenAI format) ---"
+echo "--- 3. Simple query - no routing match (falls back to request model) ---"
 echo ""
 curl -s "$PLANO_URL/routing/v1/chat/completions" \
@@ -62,8 +65,31 @@ curl -s "$PLANO_URL/routing/v1/messages" \
   }' | python3 -m json.tool
 echo ""
 
-# --- Example 5: Inline routing policy in request body ---
-echo "--- 5. Inline routing_policy (no config needed) ---"
+# --- Example 5: Inline routing_preferences with prefer:cheapest ---
+echo "--- 5. Inline routing_preferences (prefer: cheapest) ---"
+echo "    models[] will be sorted by ascending cost from the cost_metrics source"
+echo ""
+curl -s "$PLANO_URL/routing/v1/chat/completions" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "gpt-4o-mini",
+    "messages": [
+      {"role": "user", "content": "Summarize the key differences between TCP and UDP"}
+    ],
+    "routing_preferences": [
+      {
+        "name": "general",
+        "description": "general questions, explanations, and summaries",
+        "models": ["openai/gpt-4o", "openai/gpt-4o-mini"],
+        "selection_policy": {"prefer": "cheapest"}
+      }
+    ]
+  }' | python3 -m json.tool
+echo ""
+
+# --- Example 6: Inline routing_preferences with prefer:fastest ---
+echo "--- 6. Inline routing_preferences (prefer: fastest) ---"
+echo "    models[] will be sorted by ascending P95 latency from Prometheus"
 echo ""
 curl -s "$PLANO_URL/routing/v1/chat/completions" \
   -H "Content-Type: application/json" \
@@ -72,46 +98,12 @@ curl -s "$PLANO_URL/routing/v1/chat/completions" \
     "messages": [
       {"role": "user", "content": "Write a quicksort implementation in Go"}
     ],
-    "routing_policy": [
+    "routing_preferences": [
       {
-        "model": "openai/gpt-4o",
-        "routing_preferences": [
-          {"name": "coding", "description": "code generation, writing functions, debugging"}
-        ]
-      },
-      {
-        "model": "openai/gpt-4o-mini",
-        "routing_preferences": [
-          {"name": "general", "description": "general questions, simple lookups, casual conversation"}
-        ]
-      }
-    ]
-  }' | python3 -m json.tool
-echo ""
-
-# --- Example 6: Inline routing policy with Anthropic format ---
-echo "--- 6. Inline routing_policy (Anthropic format) ---"
-echo ""
-curl -s "$PLANO_URL/routing/v1/messages" \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "gpt-4o-mini",
-    "max_tokens": 1024,
-    "messages": [
-      {"role": "user", "content": "What is the weather like today?"}
-    ],
-    "routing_policy": [
-      {
-        "model": "openai/gpt-4o",
-        "routing_preferences": [
-          {"name": "coding", "description": "code generation, writing functions, debugging"}
-        ]
-      },
-      {
-        "model": "openai/gpt-4o-mini",
-        "routing_preferences": [
-          {"name": "general", "description": "general questions, simple lookups, casual conversation"}
-        ]
+        "name": "coding",
+        "description": "code generation, writing functions, debugging",
+        "models": ["anthropic/claude-sonnet-4-20250514", "openai/gpt-4o", "openai/gpt-4o-mini"],
+        "selection_policy": {"prefer": "fastest"}
       }
     ]
   }' | python3 -m json.tool
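The banner added at the top of the script spells out the client contract: the `/routing/v1/*` endpoints return a ranked `models` list instead of proxying the request. Below is a sketch of the consume-and-fall-back loop that banner describes, assuming the response is JSON with a top-level `models` array and using a hypothetical `send` callable for the actual LLM call; neither the response schema nor `PLANO_URL`'s value is defined in this diff.

```python
# Sketch only: walks the ranked models[] and retries on 429/5xx, per the
# guidance in the demo banner. PLANO_URL and the response shape are assumptions.
import json
import urllib.error
import urllib.request

PLANO_URL = "http://localhost:10000"  # assumed demo address


def ranked_models(payload):
    req = urllib.request.Request(
        f"{PLANO_URL}/routing/v1/chat/completions",
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        return json.load(resp)["models"]  # ranked best-first


def call_with_fallback(payload, send):
    # send(model) is the caller's own function that hits the chosen LLM.
    last_err = None
    for model in ranked_models(payload):
        try:
            return send(model)
        except urllib.error.HTTPError as err:
            if err.code == 429 or err.code >= 500:
                last_err = err  # retryable: move to the next-ranked model
                continue
            raise
    raise last_err or RuntimeError("no models available")
```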
diff --git a/demos/llm_routing/model_routing_service/metrics_server.py b/demos/llm_routing/model_routing_service/metrics_server.py
index a7a22081..d236fe73 100644
--- a/demos/llm_routing/model_routing_service/metrics_server.py
+++ b/demos/llm_routing/model_routing_service/metrics_server.py
@@ -1,10 +1,14 @@
 """
-Minimal Prometheus metrics server for demo purposes.
-Exposes mock P95 latency data for model routing.
+Demo metrics server.
+
+Exposes two endpoints:
+    GET /metrics - Prometheus text format, P95 latency per model (scraped by Prometheus)
+    GET /costs   - JSON cost data per model, compatible with the cost_metrics source
 """
+import json
 from http.server import HTTPServer, BaseHTTPRequestHandler
 
-METRICS = """\
+PROMETHEUS_METRICS = """\
 # HELP model_latency_p95_seconds P95 request latency in seconds per model
 # TYPE model_latency_p95_seconds gauge
 model_latency_p95_seconds{model_name="anthropic/claude-sonnet-4-20250514"} 0.85
@@ -12,13 +16,27 @@ model_latency_p95_seconds{model_name="openai/gpt-4o"} 1.20
 model_latency_p95_seconds{model_name="openai/gpt-4o-mini"} 0.40
 """.encode()
 
+COST_DATA = {
+    "anthropic/claude-sonnet-4-20250514": {"input_per_million": 3.0, "output_per_million": 15.0},
+    "openai/gpt-4o": {"input_per_million": 5.0, "output_per_million": 20.0},
+    "openai/gpt-4o-mini": {"input_per_million": 0.15, "output_per_million": 0.6},
+}
+
 
 class MetricsHandler(BaseHTTPRequestHandler):
     def do_GET(self):
-        self.send_response(200)
-        self.send_header("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
-        self.end_headers()
-        self.wfile.write(METRICS)
+        if self.path == "/costs":
+            body = json.dumps(COST_DATA).encode()
+            self.send_response(200)
+            self.send_header("Content-Type", "application/json")
+            self.end_headers()
+            self.wfile.write(body)
+        else:
+            # /metrics and everything else -> Prometheus format
+            self.send_response(200)
+            self.send_header("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
+            self.end_headers()
+            self.wfile.write(PROMETHEUS_METRICS)
 
     def log_message(self, fmt, *args):
         pass  # suppress access logs
@@ -26,5 +44,5 @@ class MetricsHandler(BaseHTTPRequestHandler):
 
 if __name__ == "__main__":
     server = HTTPServer(("", 8080), MetricsHandler)
-    print("metrics server listening on :8080", flush=True)
+    print("metrics server listening on :8080 (/metrics, /costs)", flush=True)
     server.serve_forever()
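A quick smoke test for the extended server (start `python3 metrics_server.py` first). It asserts only what the handler above guarantees: JSON on `/costs` and Prometheus text on every other path.

```python
# Smoke test for metrics_server.py; assumes the server is already running.
import json
import urllib.request

with urllib.request.urlopen("http://localhost:8080/costs") as resp:
    assert resp.headers["Content-Type"] == "application/json"
    costs = json.load(resp)
    assert "openai/gpt-4o-mini" in costs

with urllib.request.urlopen("http://localhost:8080/metrics") as resp:
    text = resp.read().decode()
    assert "model_latency_p95_seconds" in text

print("both endpoints OK")
```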