# plano/demos/llm_routing/model_routing_service/vllm-deployment.yaml

# Deployment that serves the Arch-Router GGUF model through `vllm serve`
# under the served model name "Plano-Orchestrator" on port 10000.
# An init container downloads the weights into a shared emptyDir volume
# before the vLLM container starts.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: plano-orchestrator
  labels:
    app: plano-orchestrator
spec:
  replicas: 1
  selector:
    matchLabels:
      app: plano-orchestrator
  template:
    metadata:
      labels:
        app: plano-orchestrator
    spec:
      # Tolerate the nvidia.com/gpu NoSchedule taint so the pod can be placed
      # on tainted GPU nodes.
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      # Optional: add a nodeSelector to pin to a specific GPU node pool.
      # The nvidia.com/gpu resource request below is sufficient for most clusters.
      # nodeSelector:
      #   DigitalOcean: doks.digitalocean.com/gpu-model: l40s
      #   GKE:          cloud.google.com/gke-accelerator: nvidia-l4
      #   EKS:          eks.amazonaws.com/nodegroup: gpu-nodes
      #   AKS:          kubernetes.azure.com/agentpool: gpupool
      initContainers:
        # Fetches the model weights into /models (the shared model-cache
        # volume) before the serving container starts.
        - name: download-model
          image: python:3.11-slim
          command:
            - sh
            - -c
            # The pip requirement is quoted so the shell never treats
            # "[cli]" as a glob pattern. allow_patterns limits the download
            # to the single quantization file that `vllm serve` loads below,
            # instead of every file in the repository.
            - |
              pip install "huggingface_hub[cli]" && \
              python -c "from huggingface_hub import snapshot_download; snapshot_download('katanemo/Arch-Router-1.5B.gguf', local_dir='/models/Arch-Router-1.5B.gguf', allow_patterns=['Arch-Router-1.5B-Q4_K_M.gguf'])"
          volumeMounts:
            - name: model-cache
              mountPath: /models
      containers:
        - name: vllm
          # NOTE(review): `latest` is a floating tag; consider pinning to a
          # tested vLLM release so rollouts are reproducible.
          image: vllm/vllm-openai:latest
          command:
            - vllm
            - serve
            # Path written by the download-model init container.
            - /models/Arch-Router-1.5B.gguf/Arch-Router-1.5B-Q4_K_M.gguf
            - "--host"
            - "0.0.0.0"
            - "--port"
            - "10000"
            - "--load-format"
            - "gguf"
            # Tokenizer is referenced by Hugging Face repo id rather than a
            # local path under /models.
            - "--tokenizer"
            - "katanemo/Arch-Router-1.5B"
            # Model name exposed to API clients.
            - "--served-model-name"
            - "Plano-Orchestrator"
            - "--gpu-memory-utilization"
            - "0.3"
            - "--tensor-parallel-size"
            - "1"
            - "--enable-prefix-caching"
          ports:
            - name: http
              containerPort: 10000
              protocol: TCP
          resources:
            requests:
              cpu: "1"
              memory: "4Gi"
              nvidia.com/gpu: "1"
            limits:
              cpu: "4"
              memory: "8Gi"
              nvidia.com/gpu: "1"
          volumeMounts:
            - name: model-cache
              mountPath: /models
          # Readiness: GET /health on port 10000, first check after 60s,
          # then every 10s.
          readinessProbe:
            httpGet:
              path: /health
              port: 10000
            initialDelaySeconds: 60
            periodSeconds: 10
          # Liveness: same endpoint, first check after 180s, then every 30s.
          # NOTE(review): the longer delay presumably covers model load time;
          # confirm it is sufficient on the target GPU.
          livenessProbe:
            httpGet:
              path: /health
              port: 10000
            initialDelaySeconds: 180
            periodSeconds: 30
      volumes:
        # emptyDir lives only as long as the pod, so the weights are
        # downloaded again whenever the pod is recreated.
        - name: model-cache
          emptyDir: {}
---
# Service in front of the plano-orchestrator pods. No `type` is set, so the
# Kubernetes default Service type applies.
kind: Service
apiVersion: v1
metadata:
  name: plano-orchestrator
spec:
  # Service port 10000 is forwarded to port 10000 on the selected pods.
  ports:
    - port: 10000
      targetPort: 10000
      name: http
  # Routes to pods carrying the label app=plano-orchestrator.
  selector:
    app: plano-orchestrator
add k8s deployment manifests and docs for self-hosted Arch-Router (#831) 2026-03-16 12:05:30 -07:00			`apiVersion: apps/v1`
			`kind: Deployment`
			`metadata:`
use plano-orchestrator for LLM routing, remove arch-router (#886) 2026-04-15 16:41:42 -07:00			`name: plano-orchestrator`
add k8s deployment manifests and docs for self-hosted Arch-Router (#831) 2026-03-16 12:05:30 -07:00			`labels:`
use plano-orchestrator for LLM routing, remove arch-router (#886) 2026-04-15 16:41:42 -07:00			`app: plano-orchestrator`
add k8s deployment manifests and docs for self-hosted Arch-Router (#831) 2026-03-16 12:05:30 -07:00			`spec:`
			`replicas: 1`
			`selector:`
			`matchLabels:`
use plano-orchestrator for LLM routing, remove arch-router (#886) 2026-04-15 16:41:42 -07:00			`app: plano-orchestrator`
add k8s deployment manifests and docs for self-hosted Arch-Router (#831) 2026-03-16 12:05:30 -07:00			`template:`
			`metadata:`
			`labels:`
use plano-orchestrator for LLM routing, remove arch-router (#886) 2026-04-15 16:41:42 -07:00			`app: plano-orchestrator`
add k8s deployment manifests and docs for self-hosted Arch-Router (#831) 2026-03-16 12:05:30 -07:00			`spec:`
			`tolerations:`
			`- key: nvidia.com/gpu`
			`operator: Exists`
			`effect: NoSchedule`
			`# Optional: add a nodeSelector to pin to a specific GPU node pool.`
			`# The nvidia.com/gpu resource request below is sufficient for most clusters.`
			`# nodeSelector:`
			`# DigitalOcean: doks.digitalocean.com/gpu-model: l40s`
			`# GKE: cloud.google.com/gke-accelerator: nvidia-l4`
			`# EKS: eks.amazonaws.com/nodegroup: gpu-nodes`
			`# AKS: kubernetes.azure.com/agentpool: gpupool`
			`initContainers:`
			`- name: download-model`
			`image: python:3.11-slim`
			`command:`
			`- sh`
			`- -c`
			`- \|`
			`pip install huggingface_hub[cli] && \`
			`python -c "from huggingface_hub import snapshot_download; snapshot_download('katanemo/Arch-Router-1.5B.gguf', local_dir='/models/Arch-Router-1.5B.gguf')"`
			`volumeMounts:`
			`- name: model-cache`
			`mountPath: /models`
			`containers:`
			`- name: vllm`
			`image: vllm/vllm-openai:latest`
			`command:`
			`- vllm`
			`- serve`
			`- /models/Arch-Router-1.5B.gguf/Arch-Router-1.5B-Q4_K_M.gguf`
			`- "--host"`
			`- "0.0.0.0"`
			`- "--port"`
			`- "10000"`
			`- "--load-format"`
			`- "gguf"`
			`- "--tokenizer"`
			`- "katanemo/Arch-Router-1.5B"`
			`- "--served-model-name"`
use plano-orchestrator for LLM routing, remove arch-router (#886) 2026-04-15 16:41:42 -07:00			`- "Plano-Orchestrator"`
add k8s deployment manifests and docs for self-hosted Arch-Router (#831) 2026-03-16 12:05:30 -07:00			`- "--gpu-memory-utilization"`
			`- "0.3"`
			`- "--tensor-parallel-size"`
			`- "1"`
			`- "--enable-prefix-caching"`
			`ports:`
			`- name: http`
			`containerPort: 10000`
			`protocol: TCP`
			`resources:`
			`requests:`
			`cpu: "1"`
			`memory: "4Gi"`
			`nvidia.com/gpu: "1"`
			`limits:`
			`cpu: "4"`
			`memory: "8Gi"`
			`nvidia.com/gpu: "1"`
			`volumeMounts:`
			`- name: model-cache`
			`mountPath: /models`
			`readinessProbe:`
			`httpGet:`
			`path: /health`
			`port: 10000`
			`initialDelaySeconds: 60`
			`periodSeconds: 10`
			`livenessProbe:`
			`httpGet:`
			`path: /health`
			`port: 10000`
			`initialDelaySeconds: 180`
			`periodSeconds: 30`
			`volumes:`
			`- name: model-cache`
			`emptyDir: {}`
			`---`
			`apiVersion: v1`
			`kind: Service`
			`metadata:`
use plano-orchestrator for LLM routing, remove arch-router (#886) 2026-04-15 16:41:42 -07:00			`name: plano-orchestrator`
add k8s deployment manifests and docs for self-hosted Arch-Router (#831) 2026-03-16 12:05:30 -07:00			`spec:`
			`selector:`
use plano-orchestrator for LLM routing, remove arch-router (#886) 2026-04-15 16:41:42 -07:00			`app: plano-orchestrator`
add k8s deployment manifests and docs for self-hosted Arch-Router (#831) 2026-03-16 12:05:30 -07:00			`ports:`
			`- name: http`
			`port: 10000`
			`targetPort: 10000`