use plano-orchestrator for LLM routing, remove arch-router

Replace RouterService/RouterModelV1 (arch-router prompt) with OrchestratorService/OrchestratorModelV1 (plano-orchestrator prompt) for LLM routing. This ensures the correct system prompt is used when llm_routing_model points at a Plano-Orchestrator model.

- Extend OrchestratorService with session caching, ModelMetricsService, top-level routing preferences, and determine_route() for LLM routing
- Delete RouterService, RouterModel trait, RouterModelV1, and ARCH_ROUTER_V1_SYSTEM_PROMPT
- Unify defaults to Plano-Orchestrator / plano-orchestrator
- Update CLI config generator, demos, docs, and config schema

Made-with: Cursor
2026-06-02 14:35:14 +02:00 · 2026-04-15 13:11:17 -07:00 · 2026-04-15 13:11:17 -07:00 · af724fcc1e
commit af724fcc1e
parent 980faef6be
27 changed files with 380 additions and 1412 deletions
--- a/demos/llm_routing/model_routing_service/vllm-deployment.yaml
+++ b/demos/llm_routing/model_routing_service/vllm-deployment.yaml
@@ -1,18 +1,18 @@
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: arch-router
+  name: plano-orchestrator
  labels:
-    app: arch-router
+    app: plano-orchestrator
 spec:
  replicas: 1
  selector:
    matchLabels:
-      app: arch-router
+      app: plano-orchestrator
  template:
    metadata:
      labels:
-        app: arch-router
+        app: plano-orchestrator
    spec:
      tolerations:
        - key: nvidia.com/gpu
@@ -53,7 +53,7 @@ spec:
            - "--tokenizer"
            - "katanemo/Arch-Router-1.5B"
            - "--served-model-name"
-            - "Arch-Router"
+            - "Plano-Orchestrator"
            - "--gpu-memory-utilization"
            - "0.3"
            - "--tensor-parallel-size"
@@ -94,10 +94,10 @@ spec:
 apiVersion: v1
 kind: Service
 metadata:
-  name: arch-router
+  name: plano-orchestrator
 spec:
  selector:
-    app: arch-router
+    app: plano-orchestrator
  ports:
    - name: http
      port: 10000