From 5388c6777f5024bac6869d9124a759da9314f471 Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Mon, 16 Mar 2026 12:05:30 -0700 Subject: [PATCH] add k8s deployment manifests and docs for self-hosted Arch-Router (#831) --- .pre-commit-config.yaml | 1 + .../llm_routing/model_routing_service/DEMO.md | 341 ------------------ .../model_routing_service/README.md | 113 +++++- .../model_routing_service/config_k8s.yaml | 33 ++ .../plano-deployment.yaml | 68 ++++ .../model_routing_service/test.rest | 36 ++ .../vllm-deployment.yaml | 104 ++++++ docs/source/guides/llm_router.rst | 29 ++ 8 files changed, 383 insertions(+), 342 deletions(-) delete mode 100644 demos/llm_routing/model_routing_service/DEMO.md create mode 100644 demos/llm_routing/model_routing_service/config_k8s.yaml create mode 100644 demos/llm_routing/model_routing_service/plano-deployment.yaml create mode 100644 demos/llm_routing/model_routing_service/test.rest create mode 100644 demos/llm_routing/model_routing_service/vllm-deployment.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 84001c45..22a18416 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,6 +4,7 @@ repos: hooks: - id: check-yaml exclude: config/envoy.template* + args: [--allow-multiple-documents] - id: end-of-file-fixer - id: trailing-whitespace - repo: local diff --git a/demos/llm_routing/model_routing_service/DEMO.md b/demos/llm_routing/model_routing_service/DEMO.md deleted file mode 100644 index a64604a8..00000000 --- a/demos/llm_routing/model_routing_service/DEMO.md +++ /dev/null @@ -1,341 +0,0 @@ -# Plano: Intelligent LLM Routing as Infrastructure - ---- - -## Plano - -An AI-native proxy and data plane for agentic apps — with built-in orchestration, safety, observability, and smart LLM routing so you stay focused on your agent's core logic. 
- -- **One endpoint, many models** — apps call Plano using standard OpenAI/Anthropic APIs; Plano handles provider selection, keys, and failover -- **Intelligent routing** — a lightweight 1.5B router model classifies user intent and picks the best model per request -- **Platform governance** — centralize API keys, rate limits, guardrails, and observability without touching app code -- **Runs anywhere** — single binary, no dependencies; self-host the router for full data privacy - -``` -┌───────────┐ ┌─────────────────────────────────┐ ┌──────────────┐ -│ Client │ ──── │ Plano │ ──── │ OpenAI │ -│ (any │ │ │ │ Anthropic │ -│ language)│ │ Arch-Router (1.5B model) │ │ Any Provider│ -└───────────┘ │ analyzes intent → picks model │ └──────────────┘ - └─────────────────────────────────┘ -``` - ---- - -## Live Demo: Routing Decision Service - -The `/routing/v1/*` endpoints return **routing decisions without calling the LLM** — perfect for inspecting, testing, and validating routing behavior. - ---- - -### Demo 1 — Code Generation Request - -```bash -curl -s http://localhost:12000/routing/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "gpt-4o-mini", - "messages": [ - {"role": "user", "content": "Write a Python function that implements binary search"} - ] - }' -``` - -**Response:** -```json -{ - "model": "anthropic/claude-sonnet-4-20250514", - "route": "code_generation" -} -``` - -Plano recognized the coding intent and routed to Claude. - ---- - -### Demo 2 — Complex Reasoning Request - -```bash -curl -s http://localhost:12000/routing/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "gpt-4o-mini", - "messages": [ - {"role": "user", "content": "Explain the trade-offs between microservices and monolithic architectures"} - ] - }' -``` - -**Response:** -```json -{ - "model": "openai/gpt-4o", - "route": "complex_reasoning" -} -``` - -Same endpoint — Plano routed to GPT-4o for reasoning. 
- ---- - -### Demo 3 — Simple Question (No Match) - -```bash -curl -s http://localhost:12000/routing/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "gpt-4o-mini", - "messages": [ - {"role": "user", "content": "What is the capital of France?"} - ] - }' -``` - -**Response:** -```json -{ - "model": "none", - "route": "null" -} -``` - -No preference matched — falls back to the default (cheapest) model. - ---- - -### Demo 4 — Anthropic Messages Format - -```bash -curl -s http://localhost:12000/routing/v1/messages \ - -H "Content-Type: application/json" \ - -d '{ - "model": "gpt-4o-mini", - "max_tokens": 1024, - "messages": [ - {"role": "user", "content": "Create a REST API endpoint in Rust using actix-web that handles user registration"} - ] - }' -``` - -**Response:** -```json -{ - "model": "anthropic/claude-sonnet-4-20250514", - "route": "code_generation" -} -``` - -Same routing, Anthropic request format. - ---- - -### Demo 5 — OpenAI Responses API Format - -```bash -curl -s http://localhost:12000/routing/v1/responses \ - -H "Content-Type: application/json" \ - -d '{ - "model": "gpt-4o-mini", - "input": "Build a React component that renders a sortable data table" - }' -``` - -**Response:** -```json -{ - "model": "anthropic/claude-sonnet-4-20250514", - "route": "code_generation" -} -``` - -Same routing engine, works with the OpenAI Responses API format too. - ---- - -## How Did That Work? - -10 lines of YAML. No code. - -```yaml -model_providers: - - - model: openai/gpt-4o-mini - default: true # fallback for unmatched requests - - - model: openai/gpt-4o - routing_preferences: - - name: complex_reasoning - description: complex reasoning tasks, multi-step analysis - - - model: anthropic/claude-sonnet-4-20250514 - routing_preferences: - - name: code_generation - description: generating new code, writing functions -``` - -That's the entire routing configuration. 
- ---- - -## Under the Hood: How Routing Preferences Work - -### Writing Good Preferences - -Each `routing_preference` has two fields: - -| Field | Purpose | Example | -|---|---|---| -| `name` | Route identifier (returned in responses) | `code_generation` | -| `description` | Natural language — tells the router **when** to pick this model | `generating new code, writing functions, or creating boilerplate` | - -The `description` is the key lever. Write it like you're explaining to a colleague when to use this model: - -```yaml -# Good — specific, descriptive -routing_preferences: - - name: code_generation - description: generating new code snippets, writing functions, creating boilerplate, or refactoring existing code - -# Too vague — overlaps with everything -routing_preferences: - - name: code - description: anything related to code -``` - -Tips: -- **Be specific** — "multi-step mathematical proofs and formal logic" beats "hard questions" -- **Describe the task, not the model** — focus on what the user is asking for -- **Avoid overlap** — if two preferences match the same request, the router has to guess -- **One model can have multiple preferences** — good at both code and math? List both - ---- - -### How Arch-Router Uses Them - -When a request arrives, Plano constructs a prompt for the 1.5B Arch-Router model: - -```xml -You are a helpful assistant designed to find the best suited route. - - -[ - {"name": "complex_reasoning", "description": "complex reasoning tasks, multi-step analysis"}, - {"name": "code_generation", "description": "generating new code, writing functions"} -] - - - -[{"role": "user", "content": "Write a Python function that implements binary search"}] - - -Your task is to decide which route best suits the user intent... -``` - -The router classifies the intent and responds: -```json -{"route": "code_generation"} -``` - -Plano maps `code_generation` back to the model that owns it → `anthropic/claude-sonnet-4-20250514`. 
- ---- - -### The Full Flow - -``` -1. Request arrives → "Write binary search in Python" -2. Preferences serialized → [{"name":"code_generation", ...}, {"name":"complex_reasoning", ...}] -3. Arch-Router classifies → {"route": "code_generation"} -4. Route → Model lookup → code_generation → anthropic/claude-sonnet-4-20250514 -5. Request forwarded → Claude generates the response -``` - -No match? Arch-Router returns `{"route": "other"}` → Plano falls back to the default model. - ---- - -### What Powers the Routing - -**Arch-Router** — a purpose-built 1.5B parameter model for intent classification. - -- Runs locally (Ollama) or hosted — no data leaves your network -- Sub-100ms routing decisions -- Handles multi-turn conversations (automatically truncates to fit context) -- Based on preference-aligned routing research - ---- - -## Multi-Format Support - -Same routing engine, any API format: - -| Endpoint | Format | -|---|---| -| `/routing/v1/chat/completions` | OpenAI Chat Completions | -| `/routing/v1/messages` | Anthropic Messages | -| `/routing/v1/responses` | OpenAI Responses API | - ---- - -## Inline Routing Policy - -Clients can override routing at request time — no config change needed: - -```json -{ - "model": "gpt-4o-mini", - "messages": [{"role": "user", "content": "Write quicksort in Go"}], - "routing_policy": [ - { - "model": "openai/gpt-4o", - "routing_preferences": [ - {"name": "coding", "description": "code generation and debugging"} - ] - }, - { - "model": "openai/gpt-4o-mini", - "routing_preferences": [ - {"name": "general", "description": "simple questions and conversation"} - ] - } - ] -} -``` - -Platform sets defaults. Teams override when needed. 
- ---- - -## Beyond Routing - -Plano is a full AI data plane: - -- **Guardrails** — prompt/response filtering, PII detection -- **Observability** — OpenTelemetry tracing, per-request metrics -- **Rate Limiting** — token-aware rate limiting per model -- **Multi-Provider** — OpenAI, Anthropic, Azure, Gemini, Groq, DeepSeek, Ollama, and more -- **Model Aliases** — `arch.fast.v1` → `gpt-4o-mini` (swap providers without client changes) - ---- - -## Key Takeaways - -1. **No SDK required** — standard API, any language, any framework -2. **Semantic routing** — plain English preferences, not hand-coded rules -3. **Self-hosted router** — 1.5B model runs locally, no data leaves the network -4. **Inspect before you route** — decision-only endpoints for testing and CI/CD -5. **Platform governance** — centralized keys, aliases, and routing policies - ---- - -## Try It - -```bash -pip install planoai -export OPENAI_API_KEY=... -export ANTHROPIC_API_KEY=... -plano up -f config.yaml -bash demo.sh -``` - -**GitHub:** github.com/katanemo/plano diff --git a/demos/llm_routing/model_routing_service/README.md b/demos/llm_routing/model_routing_service/README.md index 85d56abf..72b672f3 100644 --- a/demos/llm_routing/model_routing_service/README.md +++ b/demos/llm_routing/model_routing_service/README.md @@ -1,6 +1,54 @@ # Model Routing Service Demo -This demo shows how to use the `/routing/v1/*` endpoints to get routing decisions without proxying requests to an LLM. The endpoint accepts standard LLM request formats and returns which model Plano's router would select. +Plano is an AI-native proxy and data plane for agentic apps — with built-in orchestration, safety, observability, and intelligent LLM routing. 
+ +``` +┌───────────┐ ┌─────────────────────────────────┐ ┌──────────────┐ +│ Client │ ───► │ Plano │ ───► │ OpenAI │ +│ (any │ │ │ │ Anthropic │ +│ language)│ │ Arch-Router (1.5B model) │ │ Any Provider│ +└───────────┘ │ analyzes intent → picks model │ └──────────────┘ + └─────────────────────────────────┘ +``` + +- **One endpoint, many models** — apps call Plano using standard OpenAI/Anthropic APIs; Plano handles provider selection, keys, and failover +- **Intelligent routing** — a lightweight 1.5B router model classifies user intent and picks the best model per request +- **Platform governance** — centralize API keys, rate limits, guardrails, and observability without touching app code +- **Runs anywhere** — single binary; self-host the router for full data privacy + +## How Routing Works + +The entire routing configuration is plain YAML — no code: + +```yaml +model_providers: + - model: openai/gpt-4o-mini + default: true # fallback for unmatched requests + + - model: openai/gpt-4o + routing_preferences: + - name: complex_reasoning + description: complex reasoning tasks, multi-step analysis + + - model: anthropic/claude-sonnet-4-20250514 + routing_preferences: + - name: code_generation + description: generating new code, writing functions +``` + +When a request arrives, Plano sends the conversation and routing preferences to Arch-Router, which classifies the intent and returns the matching route: + +``` +1. Request arrives → "Write binary search in Python" +2. Preferences serialized → [{"name":"code_generation", ...}, {"name":"complex_reasoning", ...}] +3. Arch-Router classifies → {"route": "code_generation"} +4. Route → Model lookup → code_generation → anthropic/claude-sonnet-4-20250514 +5. Request forwarded → Claude generates the response +``` + +No match? Arch-Router returns `other` → Plano falls back to the default model. 
+ +The `/routing/v1/*` endpoints return the routing decision **without** forwarding to the LLM — useful for testing and validating routing behavior before going to production. ## Setup @@ -55,6 +103,69 @@ Response: The response tells you which model would handle this request and which route was matched, without actually making the LLM call. +## Kubernetes Deployment (Self-hosted Arch-Router on GPU) + +To run Arch-Router in-cluster using vLLM instead of the default hosted endpoint: + +**0. Check your GPU node labels and taints** + +```bash +kubectl get nodes --show-labels | grep -i gpu +kubectl get nodes -o jsonpath='{.items[*].spec.taints}' +``` + +GPU nodes commonly have a `nvidia.com/gpu:NoSchedule` taint — `vllm-deployment.yaml` includes a matching toleration. If you have multiple GPU node pools and need to pin to a specific one, uncomment and set the `nodeSelector` in `vllm-deployment.yaml` using the label for your cloud provider. + +**1. Deploy Arch-Router and Plano:** + +```bash + +# arch-router deployment +kubectl apply -f vllm-deployment.yaml + +# plano deployment +kubectl create secret generic plano-secrets \ --from-literal=OPENAI_API_KEY=$OPENAI_API_KEY \ --from-literal=ANTHROPIC_API_KEY=$ANTHROPIC_API_KEY + +kubectl create configmap plano-config \ --from-file=plano_config.yaml=config_k8s.yaml \ --dry-run=client -o yaml | kubectl apply -f - + +kubectl apply -f plano-deployment.yaml +``` + +**2. Wait for both pods to be ready:** + +```bash +# Arch-Router downloads the model (~1 min) then vLLM loads it (~2 min) +kubectl get pods -l app=arch-router -w +kubectl rollout status deployment/plano +``` + +**3. 
Test:** + +```bash +kubectl port-forward svc/plano 12000:12000 +./demo.sh +``` + +To confirm requests are hitting your in-cluster Arch-Router (not just health checks): + +```bash +kubectl logs -l app=arch-router -f --tail=0 +# Look for POST /v1/chat/completions entries +``` + +**Updating the config:** + +```bash +kubectl create configmap plano-config \ + --from-file=plano_config.yaml=config_k8s.yaml \ + --dry-run=client -o yaml | kubectl apply -f - +kubectl rollout restart deployment/plano +``` + ## Demo Output ``` diff --git a/demos/llm_routing/model_routing_service/config_k8s.yaml b/demos/llm_routing/model_routing_service/config_k8s.yaml new file mode 100644 index 00000000..bdf98bfa --- /dev/null +++ b/demos/llm_routing/model_routing_service/config_k8s.yaml @@ -0,0 +1,33 @@ +version: v0.3.0 + +overrides: + llm_routing_model: plano/Arch-Router + +listeners: + - type: model + name: model_listener + port: 12000 + +model_providers: + + - model: plano/Arch-Router + base_url: http://arch-router:10000 + + - model: openai/gpt-4o-mini + access_key: $OPENAI_API_KEY + default: true + + - model: openai/gpt-4o + access_key: $OPENAI_API_KEY + routing_preferences: + - name: complex_reasoning + description: complex reasoning tasks, multi-step analysis, or detailed explanations + + - model: anthropic/claude-sonnet-4-20250514 + access_key: $ANTHROPIC_API_KEY + routing_preferences: + - name: code_generation + description: generating new code, writing functions, or creating boilerplate + +tracing: + random_sampling: 100 diff --git a/demos/llm_routing/model_routing_service/plano-deployment.yaml b/demos/llm_routing/model_routing_service/plano-deployment.yaml new file mode 100644 index 00000000..e093f404 --- /dev/null +++ b/demos/llm_routing/model_routing_service/plano-deployment.yaml @@ -0,0 +1,68 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: plano + labels: + app: plano +spec: + replicas: 1 + selector: + matchLabels: + app: plano + template: + metadata: + labels: + 
app: plano + spec: + containers: + - name: plano + image: katanemo/plano:0.4.12 + ports: + - containerPort: 12000 # LLM gateway (chat completions, model routing) + name: llm-gateway + envFrom: + - secretRef: + name: plano-secrets + env: + - name: LOG_LEVEL + value: "info" + volumeMounts: + - name: plano-config + mountPath: /app/plano_config.yaml + subPath: plano_config.yaml + readOnly: true + readinessProbe: + httpGet: + path: /healthz + port: 12000 + initialDelaySeconds: 5 + periodSeconds: 10 + livenessProbe: + httpGet: + path: /healthz + port: 12000 + initialDelaySeconds: 10 + periodSeconds: 30 + resources: + requests: + memory: "256Mi" + cpu: "250m" + limits: + memory: "512Mi" + cpu: "1000m" + volumes: + - name: plano-config + configMap: + name: plano-config +--- +apiVersion: v1 +kind: Service +metadata: + name: plano +spec: + selector: + app: plano + ports: + - name: llm-gateway + port: 12000 + targetPort: 12000 diff --git a/demos/llm_routing/model_routing_service/test.rest b/demos/llm_routing/model_routing_service/test.rest new file mode 100644 index 00000000..b41d75f2 --- /dev/null +++ b/demos/llm_routing/model_routing_service/test.rest @@ -0,0 +1,36 @@ +### Code generation query (OpenAI format) — expects anthropic/claude-sonnet +POST http://localhost:12000/routing/v1/chat/completions +Content-Type: application/json + +{ + "model": "gpt-4o-mini", + "messages": [{"role": "user", "content": "Write a Python function for binary search"}] +} + +### Complex reasoning query (OpenAI format) — expects openai/gpt-4o +POST http://localhost:12000/routing/v1/chat/completions +Content-Type: application/json + +{ + "model": "gpt-4o-mini", + "messages": [{"role": "user", "content": "Analyze the trade-offs between microservices and monolithic architecture"}] +} + +### Simple query — no routing match, expects default model +POST http://localhost:12000/routing/v1/chat/completions +Content-Type: application/json + +{ + "model": "gpt-4o-mini", + "messages": [{"role": "user", 
"content": "Hello"}] +} + +### Code generation query (Anthropic format) +POST http://localhost:12000/routing/v1/messages +Content-Type: application/json + +{ + "model": "claude-sonnet-4-20250514", + "max_tokens": 1024, + "messages": [{"role": "user", "content": "Write a REST API in Go using Gin"}] +} diff --git a/demos/llm_routing/model_routing_service/vllm-deployment.yaml b/demos/llm_routing/model_routing_service/vllm-deployment.yaml new file mode 100644 index 00000000..1debe15e --- /dev/null +++ b/demos/llm_routing/model_routing_service/vllm-deployment.yaml @@ -0,0 +1,104 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: arch-router + labels: + app: arch-router +spec: + replicas: 1 + selector: + matchLabels: + app: arch-router + template: + metadata: + labels: + app: arch-router + spec: + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + # Optional: add a nodeSelector to pin to a specific GPU node pool. + # The nvidia.com/gpu resource request below is sufficient for most clusters. 
+ # nodeSelector: + # DigitalOcean: doks.digitalocean.com/gpu-model: l40s + # GKE: cloud.google.com/gke-accelerator: nvidia-l4 + # EKS: eks.amazonaws.com/nodegroup: gpu-nodes + # AKS: kubernetes.azure.com/agentpool: gpupool + initContainers: + - name: download-model + image: python:3.11-slim + command: + - sh + - -c + - | + pip install huggingface_hub[cli] && \ + python -c "from huggingface_hub import snapshot_download; snapshot_download('katanemo/Arch-Router-1.5B.gguf', local_dir='/models/Arch-Router-1.5B.gguf')" + volumeMounts: + - name: model-cache + mountPath: /models + containers: + - name: vllm + image: vllm/vllm-openai:latest + command: + - vllm + - serve + - /models/Arch-Router-1.5B.gguf/Arch-Router-1.5B-Q4_K_M.gguf + - "--host" + - "0.0.0.0" + - "--port" + - "10000" + - "--load-format" + - "gguf" + - "--tokenizer" + - "katanemo/Arch-Router-1.5B" + - "--served-model-name" + - "Arch-Router" + - "--gpu-memory-utilization" + - "0.3" + - "--tensor-parallel-size" + - "1" + - "--enable-prefix-caching" + ports: + - name: http + containerPort: 10000 + protocol: TCP + resources: + requests: + cpu: "1" + memory: "4Gi" + nvidia.com/gpu: "1" + limits: + cpu: "4" + memory: "8Gi" + nvidia.com/gpu: "1" + volumeMounts: + - name: model-cache + mountPath: /models + readinessProbe: + httpGet: + path: /health + port: 10000 + initialDelaySeconds: 60 + periodSeconds: 10 + livenessProbe: + httpGet: + path: /health + port: 10000 + initialDelaySeconds: 180 + periodSeconds: 30 + volumes: + - name: model-cache + emptyDir: {} +--- +apiVersion: v1 +kind: Service +metadata: + name: arch-router +spec: + selector: + app: arch-router + ports: + - name: http + port: 10000 + targetPort: 10000 diff --git a/docs/source/guides/llm_router.rst b/docs/source/guides/llm_router.rst index 0073a664..7c4ad685 100644 --- a/docs/source/guides/llm_router.rst +++ b/docs/source/guides/llm_router.rst @@ -347,6 +347,35 @@ vLLM provides higher throughput and GPU optimizations suitable for production de curl 
http://localhost:10000/v1/models +Using vLLM on Kubernetes (GPU nodes) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For teams running Kubernetes, Arch-Router and Plano can be deployed as in-cluster services. +The ``demos/llm_routing/model_routing_service/`` directory includes ready-to-use manifests: + +- ``vllm-deployment.yaml`` — Arch-Router served by vLLM, with an init container to download + the model from HuggingFace +- ``plano-deployment.yaml`` — Plano proxy configured to use the in-cluster Arch-Router +- ``config_k8s.yaml`` — Plano config with ``llm_routing_model`` pointing at + ``http://arch-router:10000`` instead of the default hosted endpoint + +Key things to know before deploying: + +- GPU nodes commonly have a ``nvidia.com/gpu:NoSchedule`` taint — the ``vllm-deployment.yaml`` + includes a matching toleration. The ``nvidia.com/gpu: "1"`` resource request is sufficient + for scheduling in most clusters; a ``nodeSelector`` is optional and commented out in the + manifest for cases where you need to pin to a specific GPU node pool. +- Model download takes ~1 minute; vLLM loads the model in ~1-2 minutes after that. The + ``livenessProbe`` has a 180-second ``initialDelaySeconds`` to avoid premature restarts. +- The Plano config ConfigMap must use ``--from-file=plano_config.yaml=config_k8s.yaml`` with + ``subPath`` in the Deployment — omitting ``subPath`` causes Kubernetes to mount a directory + instead of a file. + +For the canonical Plano Kubernetes deployment (ConfigMap, Secrets, Deployment YAML), see +:ref:`deployment`. For full step-by-step commands specific to this demo, see the +`demo README `_. + + Combining Routing Methods -------------------------