model routing: cost/latency ranking with ranked fallback list (#849)

2026-05-02 04:12:56 +02:00 · 2026-03-30 13:46:52 -07:00 · 2026-03-30 13:46:52 -07:00 · e5751d6b13
commit e5751d6b13
parent 3a531ce22a
23 changed files with 1524 additions and 317 deletions
--- a/demos/llm_routing/model_routing_service/config.yaml
+++ b/demos/llm_routing/model_routing_service/config.yaml
@@ -1,4 +1,4 @@
-version: v0.3.0
+version: v0.4.0

 listeners:
  - type: model
@@ -6,22 +6,48 @@ listeners:
    port: 12000

 model_providers:
-
  - model: openai/gpt-4o-mini
    access_key: $OPENAI_API_KEY
    default: true

  - model: openai/gpt-4o
    access_key: $OPENAI_API_KEY
-    routing_preferences:
-      - name: complex_reasoning
-        description: complex reasoning tasks, multi-step analysis, or detailed explanations

  - model: anthropic/claude-sonnet-4-20250514
    access_key: $ANTHROPIC_API_KEY
-    routing_preferences:
-      - name: code_generation
-        description: generating new code, writing functions, or creating boilerplate

-tracing:
-  random_sampling: 100
+routing_preferences:
+  - name: complex_reasoning
+    description: complex reasoning tasks, multi-step analysis, or detailed explanations
+    models:
+      - openai/gpt-4o
+      - openai/gpt-4o-mini
+    selection_policy:
+      prefer: cheapest
+
+  - name: code_generation
+    description: generating new code, writing functions, or creating boilerplate
+    models:
+      - anthropic/claude-sonnet-4-20250514
+      - openai/gpt-4o
+    selection_policy:
+      prefer: fastest
+
+model_metrics_sources:
+  - type: digitalocean_pricing
+    refresh_interval: 3600
+    model_aliases:
+      openai-gpt-4o: openai/gpt-4o
+      openai-gpt-4o-mini: openai/gpt-4o-mini
+      anthropic-claude-sonnet-4: anthropic/claude-sonnet-4-20250514
+
+  # Use cost_metrics instead of digitalocean_pricing to supply your own pricing data.
+  # The demo metrics_server.py exposes /costs with OpenAI and Anthropic pricing.
+  # - type: cost_metrics
+  #   url: http://localhost:8080/costs
+  #   refresh_interval: 300
+
+  - type: prometheus_metrics
+    url: http://localhost:9090
+    query: model_latency_p95_seconds
+    refresh_interval: 60