resolve merge conflict in main.rs

2026-05-21 13:55:15 +02:00 · 2026-03-16 12:40:33 -07:00 · 2026-03-16 12:40:33 -07:00 · 80dfb41cad
commit 80dfb41cad
parent 6fe7613bcd 5388c6777f
40 changed files with 920 additions and 301 deletions
--- a/demos/agent_orchestration/travel_agents/README.md
+++ b/demos/agent_orchestration/travel_agents/README.md
@ -123,6 +123,42 @@ Each agent:

 Both agents run as native local processes and communicate with Plano running natively on the host.

+## Running with local Plano-Orchestrator (via vLLM)
+
+By default, Plano uses a hosted Plano-Orchestrator endpoint. To self-host the orchestrator model locally using vLLM on a server with an NVIDIA GPU:
+
+1. Install vLLM (the model weights are downloaded from Hugging Face automatically the first time the server starts):
+```bash
+pip install vllm
+```
+
+2. Start the vLLM server with the 4B model:
+```bash
+vllm serve katanemo/Plano-Orchestrator-4B \
+    --host 0.0.0.0 \
+    --port 8000 \
+    --tensor-parallel-size 1 \
+    --gpu-memory-utilization 0.3 \
+    --tokenizer katanemo/Plano-Orchestrator-4B \
+    --chat-template chat_template.jinja \
+    --served-model-name katanemo/Plano-Orchestrator-4B \
+    --enable-prefix-caching
+```
+
+3. Start the demo with the local orchestrator config:
+```bash
+./run_demo.sh --local-orchestrator
+```
+
+4. Test with curl:
+```bash
+curl -X POST http://localhost:8001/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{"model": "gpt-5.2", "messages": [{"role": "user", "content": "What is the weather in Istanbul?"}]}'
+```
+
+You should see Plano use your local orchestrator to route the request to the weather agent.
+
 ## Observability

 This demo includes full OpenTelemetry (OTel) compatible distributed tracing to monitor and debug agent interactions:
--- a/demos/agent_orchestration/travel_agents/config_local_orchestrator.yaml
+++ b/demos/agent_orchestration/travel_agents/config_local_orchestrator.yaml
@ -0,0 +1,66 @@
+version: v0.3.0
+
+overrides:
+  agent_orchestration_model: plano/katanemo/Plano-Orchestrator-4B
+
+agents:
+  - id: weather_agent
+    url: http://localhost:10510
+  - id: flight_agent
+    url: http://localhost:10520
+
+model_providers:
+  - model: plano/katanemo/Plano-Orchestrator-4B
+    base_url: http://localhost:8000
+
+  - model: openai/gpt-5.2
+    access_key: $OPENAI_API_KEY
+    default: true
+  - model: openai/gpt-4o-mini
+    access_key: $OPENAI_API_KEY # smaller, faster, cheaper model for extracting entities like location
+
+listeners:
+  - type: agent
+    name: travel_booking_service
+    port: 8001
+    router: plano_orchestrator_v1
+    agents:
+      - id: weather_agent
+        description: |
+
+          WeatherAgent is a specialized AI assistant for real-time weather information and forecasts. It provides accurate weather data for any city worldwide using the Open-Meteo API, helping travelers plan their trips with up-to-date weather conditions.
+
+          Capabilities:
+            * Get real-time weather conditions and multi-day forecasts for any city worldwide using Open-Meteo API (free, no API key needed)
+            * Provides current temperature
+            * Provides multi-day forecasts
+            * Provides weather conditions
+            * Provides sunrise/sunset times
+            * Provides detailed weather information
+            * Understands conversation context to resolve location references from previous messages
+            * Handles weather-related questions including "What's the weather in [city]?", "What's the forecast for [city]?", "How's the weather in [city]?"
+            * When queries include both weather and other travel questions (e.g., flights, currency), this agent answers ONLY the weather part
+
+      - id: flight_agent
+        description: |
+
+          FlightAgent is an AI-powered tool specialized in providing live flight information between airports. It leverages the FlightAware AeroAPI to deliver real-time flight status, gate information, and delay updates.
+
+          Capabilities:
+            * Get live flight information between airports using FlightAware AeroAPI
+            * Shows real-time flight status
+            * Shows scheduled/estimated/actual departure and arrival times
+            * Shows gate and terminal information
+            * Shows delays
+            * Shows aircraft type
+            * Shows flight status
+            * Automatically resolves city names to airport codes (IATA/ICAO)
+            * Understands conversation context to infer origin/destination from follow-up questions
+            * Handles flight-related questions including "What flights go from [city] to [city]?", "Do flights go to [city]?", "Are there direct flights from [city]?"
+            * When queries include both flight and other travel questions (e.g., weather, currency), this agent answers ONLY the flight part
+
+tracing:
+  random_sampling: 100
+  span_attributes:
+    header_prefixes:
+      - x-acme-
--- a/demos/agent_orchestration/travel_agents/run_demo.sh
+++ b/demos/agent_orchestration/travel_agents/run_demo.sh
@ -31,8 +31,13 @@ start_demo() {
  fi

  # Step 4: Start Plano
-  echo "Starting Plano with config.yaml..."
-  planoai up config.yaml
+  PLANO_CONFIG="config.yaml"
+  if [ "$1" == "--local-orchestrator" ]; then
+    PLANO_CONFIG="config_local_orchestrator.yaml"
+    echo "Using local orchestrator config..."
+  fi
+  echo "Starting Plano with $PLANO_CONFIG..."
+  planoai up "$PLANO_CONFIG"

  # Step 5: Start agents natively
  echo "Starting agents..."
--- a/demos/llm_routing/model_routing_service/README.md
+++ b/demos/llm_routing/model_routing_service/README.md
@ -1,6 +1,54 @@
 # Model Routing Service Demo

-This demo shows how to use the `/routing/v1/*` endpoints to get routing decisions without proxying requests to an LLM. The endpoint accepts standard LLM request formats and returns which model Plano's router would select.
+Plano is an AI-native proxy and data plane for agentic apps — with built-in orchestration, safety, observability, and intelligent LLM routing.
+
+```
+┌───────────┐      ┌─────────────────────────────────┐      ┌──────────────┐
+│  Client   │ ───► │  Plano                          │ ───► │  OpenAI      │
+│  (any     │      │                                 │      │  Anthropic   │
+│  language)│      │  Arch-Router (1.5B model)       │      │  Any Provider│
+└───────────┘      │  analyzes intent → picks model  │      └──────────────┘
+                   └─────────────────────────────────┘
+```
+
+- **One endpoint, many models** — apps call Plano using standard OpenAI/Anthropic APIs; Plano handles provider selection, keys, and failover
+- **Intelligent routing** — a lightweight 1.5B router model classifies user intent and picks the best model per request
+- **Platform governance** — centralize API keys, rate limits, guardrails, and observability without touching app code
+- **Runs anywhere** — single binary; self-host the router for full data privacy
+
+## How Routing Works
+
+The entire routing configuration is plain YAML — no code:
+
+```yaml
+model_providers:
+  - model: openai/gpt-4o-mini
+    default: true                    # fallback for unmatched requests
+
+  - model: openai/gpt-4o
+    routing_preferences:
+      - name: complex_reasoning
+        description: complex reasoning tasks, multi-step analysis
+
+  - model: anthropic/claude-sonnet-4-20250514
+    routing_preferences:
+      - name: code_generation
+        description: generating new code, writing functions
+```
+
+When a request arrives, Plano sends the conversation and routing preferences to Arch-Router, which classifies the intent and returns the matching route:
+
+```
+1. Request arrives          → "Write binary search in Python"
+2. Preferences serialized   → [{"name":"code_generation", ...}, {"name":"complex_reasoning", ...}]
+3. Arch-Router classifies   → {"route": "code_generation"}
+4. Route → Model lookup     → code_generation → anthropic/claude-sonnet-4-20250514
+5. Request forwarded        → Claude generates the response
+```
+
+No match? Arch-Router returns `other` → Plano falls back to the default model.
+
+The `/routing/v1/*` endpoints return the routing decision **without** forwarding to the LLM — useful for testing and validating routing behavior before going to production.

 ## Setup

@ -55,6 +103,69 @@ Response:

 The response tells you which model would handle this request and which route was matched, without actually making the LLM call.

+## Kubernetes Deployment (Self-hosted Arch-Router on GPU)
+
+To run Arch-Router in-cluster using vLLM instead of the default hosted endpoint:
+
+**0. Check your GPU node labels and taints:**
+
+```bash
+kubectl get nodes --show-labels | grep -i gpu
+kubectl get node <gpu-node-name> -o jsonpath='{.spec.taints}'
+```
+
+GPU nodes commonly have a `nvidia.com/gpu:NoSchedule` taint — `vllm-deployment.yaml` includes a matching toleration. If you have multiple GPU node pools and need to pin to a specific one, uncomment and set the `nodeSelector` in `vllm-deployment.yaml` using the label for your cloud provider.
+
+**1. Deploy Arch-Router and Plano:**
+
+```bash
+
+# arch-router deployment
+kubectl apply -f vllm-deployment.yaml
+
+# plano deployment
+kubectl create secret generic plano-secrets \
+  --from-literal=OPENAI_API_KEY=$OPENAI_API_KEY \
+  --from-literal=ANTHROPIC_API_KEY=$ANTHROPIC_API_KEY
+
+kubectl create configmap plano-config \
+  --from-file=plano_config.yaml=config_k8s.yaml \
+  --dry-run=client -o yaml | kubectl apply -f -
+
+kubectl apply -f plano-deployment.yaml
+```
+
+**2. Wait for both pods to be ready:**
+
+```bash
+# Arch-Router downloads the model (~1 min) then vLLM loads it (~2 min)
+kubectl get pods -l app=arch-router -w
+kubectl rollout status deployment/plano
+```
+
+**3. Test:**
+
+```bash
+kubectl port-forward svc/plano 12000:12000
+./demo.sh
+```
+
+To confirm requests are hitting your in-cluster Arch-Router (not just health checks):
+
+```bash
+kubectl logs -l app=arch-router -f --tail=0
+# Look for POST /v1/chat/completions entries
+```
+
+**Updating the config:**
+
+```bash
+kubectl create configmap plano-config \
+  --from-file=plano_config.yaml=config_k8s.yaml \
+  --dry-run=client -o yaml | kubectl apply -f -
+kubectl rollout restart deployment/plano
+```
+
 ## Demo Output

 ```
--- a/demos/llm_routing/model_routing_service/config_k8s.yaml
+++ b/demos/llm_routing/model_routing_service/config_k8s.yaml
@ -0,0 +1,33 @@
+version: v0.3.0
+
+overrides:
+  llm_routing_model: plano/Arch-Router
+
+listeners:
+  - type: model
+    name: model_listener
+    port: 12000
+
+model_providers:
+
+  - model: plano/Arch-Router
+    base_url: http://arch-router:10000
+
+  - model: openai/gpt-4o-mini
+    access_key: $OPENAI_API_KEY
+    default: true
+
+  - model: openai/gpt-4o
+    access_key: $OPENAI_API_KEY
+    routing_preferences:
+      - name: complex_reasoning
+        description: complex reasoning tasks, multi-step analysis, or detailed explanations
+
+  - model: anthropic/claude-sonnet-4-20250514
+    access_key: $ANTHROPIC_API_KEY
+    routing_preferences:
+      - name: code_generation
+        description: generating new code, writing functions, or creating boilerplate
+
+tracing:
+  random_sampling: 100
--- a/demos/llm_routing/model_routing_service/plano-deployment.yaml
+++ b/demos/llm_routing/model_routing_service/plano-deployment.yaml
@ -0,0 +1,68 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: plano
+  labels:
+    app: plano
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: plano
+  template:
+    metadata:
+      labels:
+        app: plano
+    spec:
+      containers:
+        - name: plano
+          image: katanemo/plano:0.4.12
+          ports:
+            - containerPort: 12000  # LLM gateway (chat completions, model routing)
+              name: llm-gateway
+          envFrom:
+            - secretRef:
+                name: plano-secrets
+          env:
+            - name: LOG_LEVEL
+              value: "info"
+          volumeMounts:
+            - name: plano-config
+              mountPath: /app/plano_config.yaml
+              subPath: plano_config.yaml
+              readOnly: true
+          readinessProbe:
+            httpGet:
+              path: /healthz
+              port: 12000
+            initialDelaySeconds: 5
+            periodSeconds: 10
+          livenessProbe:
+            httpGet:
+              path: /healthz
+              port: 12000
+            initialDelaySeconds: 10
+            periodSeconds: 30
+          resources:
+            requests:
+              memory: "256Mi"
+              cpu: "250m"
+            limits:
+              memory: "512Mi"
+              cpu: "1000m"
+      volumes:
+        - name: plano-config
+          configMap:
+            name: plano-config
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: plano
+spec:
+  selector:
+    app: plano
+  ports:
+    - name: llm-gateway
+      port: 12000
+      targetPort: 12000
--- a/demos/llm_routing/model_routing_service/test.rest
+++ b/demos/llm_routing/model_routing_service/test.rest
@ -0,0 +1,36 @@
+### Code generation query (OpenAI format) — expects anthropic/claude-sonnet
+POST http://localhost:12000/routing/v1/chat/completions
+Content-Type: application/json
+
+{
+  "model": "gpt-4o-mini",
+  "messages": [{"role": "user", "content": "Write a Python function for binary search"}]
+}
+
+### Complex reasoning query (OpenAI format) — expects openai/gpt-4o
+POST http://localhost:12000/routing/v1/chat/completions
+Content-Type: application/json
+
+{
+  "model": "gpt-4o-mini",
+  "messages": [{"role": "user", "content": "Analyze the trade-offs between microservices and monolithic architecture"}]
+}
+
+### Simple query — no routing match, expects default model
+POST http://localhost:12000/routing/v1/chat/completions
+Content-Type: application/json
+
+{
+  "model": "gpt-4o-mini",
+  "messages": [{"role": "user", "content": "Hello"}]
+}
+
+### Code generation query (Anthropic format)
+POST http://localhost:12000/routing/v1/messages
+Content-Type: application/json
+
+{
+  "model": "claude-sonnet-4-20250514",
+  "max_tokens": 1024,
+  "messages": [{"role": "user", "content": "Write a REST API in Go using Gin"}]
+}
--- a/demos/llm_routing/model_routing_service/vllm-deployment.yaml
+++ b/demos/llm_routing/model_routing_service/vllm-deployment.yaml
@ -0,0 +1,104 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: arch-router
+  labels:
+    app: arch-router
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: arch-router
+  template:
+    metadata:
+      labels:
+        app: arch-router
+    spec:
+      tolerations:
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
+      # Optional: to pin to a specific GPU node pool, uncomment nodeSelector and ONE label line below.
+      # The nvidia.com/gpu resource request below is sufficient for most clusters.
+      # nodeSelector:
+      #   doks.digitalocean.com/gpu-model: l40s        # DigitalOcean
+      #   cloud.google.com/gke-accelerator: nvidia-l4  # GKE
+      #   eks.amazonaws.com/nodegroup: gpu-nodes       # EKS
+      #   kubernetes.azure.com/agentpool: gpupool      # AKS
+      initContainers:
+        - name: download-model
+          image: python:3.11-slim
+          command:
+            - sh
+            - -c
+            - |
+              pip install huggingface_hub[cli] && \
+              python -c "from huggingface_hub import snapshot_download; snapshot_download('katanemo/Arch-Router-1.5B.gguf', local_dir='/models/Arch-Router-1.5B.gguf')"
+          volumeMounts:
+            - name: model-cache
+              mountPath: /models
+      containers:
+        - name: vllm
+          image: vllm/vllm-openai:latest
+          command:
+            - vllm
+            - serve
+            - /models/Arch-Router-1.5B.gguf/Arch-Router-1.5B-Q4_K_M.gguf
+            - "--host"
+            - "0.0.0.0"
+            - "--port"
+            - "10000"
+            - "--load-format"
+            - "gguf"
+            - "--tokenizer"
+            - "katanemo/Arch-Router-1.5B"
+            - "--served-model-name"
+            - "Arch-Router"
+            - "--gpu-memory-utilization"
+            - "0.3"
+            - "--tensor-parallel-size"
+            - "1"
+            - "--enable-prefix-caching"
+          ports:
+            - name: http
+              containerPort: 10000
+              protocol: TCP
+          resources:
+            requests:
+              cpu: "1"
+              memory: "4Gi"
+              nvidia.com/gpu: "1"
+            limits:
+              cpu: "4"
+              memory: "8Gi"
+              nvidia.com/gpu: "1"
+          volumeMounts:
+            - name: model-cache
+              mountPath: /models
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: 10000
+            initialDelaySeconds: 60
+            periodSeconds: 10
+          livenessProbe:
+            httpGet:
+              path: /health
+              port: 10000
+            initialDelaySeconds: 180
+            periodSeconds: 30
+      volumes:
+        - name: model-cache
+          emptyDir: {}
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: arch-router
+spec:
+  selector:
+    app: arch-router
+  ports:
+    - name: http
+      port: 10000
+      targetPort: 10000
--- a/demos/llm_routing/openclaw_routing/config.yaml
+++ b/demos/llm_routing/openclaw_routing/config.yaml
@ -1,8 +1,7 @@
 version: v0.1.0

-routing:
-  model: Arch-Router
-  llm_provider: arch-router
+overrides:
+  llm_routing_model: Arch-Router

 listeners:
  egress_traffic:
--- a/demos/llm_routing/preference_based_routing/plano_config_local.yaml
+++ b/demos/llm_routing/preference_based_routing/plano_config_local.yaml
@ -1,8 +1,7 @@
 version: v0.3.0

-routing:
-  model: Arch-Router
-  llm_provider: arch-router
+overrides:
+  llm_routing_model: plano/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M

 listeners:
  - type: model
@ -11,8 +10,7 @@ listeners:

 model_providers:

-  - name: arch-router
-    model: arch/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M
+  - model: plano/hf.co/katanemo/Arch-Router-1.5B.gguf:Q4_K_M
    base_url: http://localhost:11434

  - model: openai/gpt-4o-mini