apiVersion: apps/v1
kind: Deployment
metadata:
  name: plano-orchestrator
  labels:
    app: plano-orchestrator
spec:
  replicas: 1
  selector:
    matchLabels:
      app: plano-orchestrator
  template:
    metadata:
      labels:
        app: plano-orchestrator
    spec:
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      # Optional: add a nodeSelector to pin to a specific GPU node pool.
      # The nvidia.com/gpu resource request below is sufficient for most clusters.
      # nodeSelector:
      #   doks.digitalocean.com/gpu-model: l40s        # DigitalOcean
      #   cloud.google.com/gke-accelerator: nvidia-l4  # GKE
      #   eks.amazonaws.com/nodegroup: gpu-nodes       # EKS
      #   kubernetes.azure.com/agentpool: gpupool      # AKS
      initContainers:
        # Download the GGUF checkpoint into the shared volume before vLLM starts.
        - name: download-model
          image: python:3.11-slim
          command:
            - sh
            - -c
            - |
              pip install "huggingface_hub[cli]" && \
              python -c "from huggingface_hub import snapshot_download; snapshot_download('katanemo/Arch-Router-1.5B.gguf', local_dir='/models/Arch-Router-1.5B.gguf')"
          volumeMounts:
            - name: model-cache
              mountPath: /models
      containers:
        - name: vllm
          image: vllm/vllm-openai:latest
          command:
            - vllm
            - serve
            - /models/Arch-Router-1.5B.gguf/Arch-Router-1.5B-Q4_K_M.gguf
            - "--host"
            - "0.0.0.0"
            - "--port"
            - "10000"
            - "--load-format"
            - "gguf"
            # GGUF weights still need the upstream HF tokenizer.
            - "--tokenizer"
            - "katanemo/Arch-Router-1.5B"
            - "--served-model-name"
            - "Plano-Orchestrator"
            # Cap vLLM at 30% of GPU memory; the 1.5B Q4 router needs only a
            # fraction of the card, leaving headroom for co-located workloads.
            - "--gpu-memory-utilization"
            - "0.3"
            - "--tensor-parallel-size"
            - "1"
            - "--enable-prefix-caching"
          ports:
            - name: http
              containerPort: 10000
              protocol: TCP
          resources:
            requests:
              cpu: "1"
              memory: "4Gi"
              nvidia.com/gpu: "1"
            limits:
              cpu: "4"
              memory: "8Gi"
              nvidia.com/gpu: "1"
          volumeMounts:
            - name: model-cache
              mountPath: /models
          readinessProbe:
            httpGet:
              path: /health
              port: 10000
            initialDelaySeconds: 60
            periodSeconds: 10
          livenessProbe:
            httpGet:
              path: /health
              port: 10000
            initialDelaySeconds: 180
            periodSeconds: 30
      volumes:
        # emptyDir means the model is re-downloaded on every pod restart;
        # swap in a PersistentVolumeClaim to cache it across restarts.
        - name: model-cache
          emptyDir: {}
---
apiVersion: v1
kind: Service
metadata:
  name: plano-orchestrator
spec:
  selector:
    app: plano-orchestrator
  ports:
    - name: http
      port: 10000
      targetPort: 10000
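---
# Optional smoke test: a one-shot Job that calls the router's OpenAI-compatible
# chat endpoint through the Service above. A minimal sketch, not part of the
# required deployment: the Job name is illustrative, it assumes the Job runs in
# the same namespace as the Service, and it relies on /v1/chat/completions
# being exposed (the default for the vllm/vllm-openai server).
apiVersion: batch/v1
kind: Job
metadata:
  name: plano-orchestrator-smoke-test
spec:
  backoffLimit: 2
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: curl
          image: curlimages/curl:latest
          # The image's entrypoint is curl, so these args form the full request;
          # -sf makes the Job fail (and retry) on any non-2xx response.
          args:
            - -sf
            - -X
            - POST
            - http://plano-orchestrator:10000/v1/chat/completions
            - -H
            - "Content-Type: application/json"
            - -d
            - '{"model": "Plano-Orchestrator", "messages": [{"role": "user", "content": "ping"}], "max_tokens": 8}'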