# Source: plano/demos/llm_routing/model_routing_service/vllm-deployment.yaml
# (105 lines, 2.9 KiB, YAML)

# Deployment: runs a single vLLM replica that serves the Arch-Router GGUF
# model under the name "Plano-Orchestrator" on container port 10000.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: plano-orchestrator
  labels:
    app: plano-orchestrator
spec:
  replicas: 1
  selector:
    matchLabels:
      app: plano-orchestrator
  template:
    metadata:
      labels:
        app: plano-orchestrator
    spec:
      # Tolerate the nvidia.com/gpu NoSchedule taint so the pod may be placed
      # on nodes that carry it.
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      # Optional: add a nodeSelector to pin to a specific GPU node pool.
      # The nvidia.com/gpu resource request below is sufficient for most clusters.
      # nodeSelector:
      #   DigitalOcean: doks.digitalocean.com/gpu-model: l40s
      #   GKE: cloud.google.com/gke-accelerator: nvidia-l4
      #   EKS: eks.amazonaws.com/nodegroup: gpu-nodes
      #   AKS: kubernetes.azure.com/agentpool: gpupool
      initContainers:
        # Downloads the katanemo/Arch-Router-1.5B.gguf repository snapshot into
        # the shared model-cache volume before the vllm container starts.
        - name: download-model
          image: python:3.11-slim
          command:
            - sh
            - -c
            # The pip requirement is quoted so the shell never treats "[cli]"
            # as a pathname-expansion pattern.
            - |
              pip install 'huggingface_hub[cli]' && \
              python -c "from huggingface_hub import snapshot_download; snapshot_download('katanemo/Arch-Router-1.5B.gguf', local_dir='/models/Arch-Router-1.5B.gguf')"
          volumeMounts:
            - name: model-cache
              mountPath: /models
      containers:
        - name: vllm
          # NOTE(review): ":latest" is a floating tag; consider pinning a
          # specific vLLM release for reproducible rollouts.
          image: vllm/vllm-openai:latest
          command:
            - vllm
            - serve
            # Path of the quantized model file inside the model-cache volume
            # populated by the download-model init container.
            - /models/Arch-Router-1.5B.gguf/Arch-Router-1.5B-Q4_K_M.gguf
            - "--host"
            - "0.0.0.0"
            - "--port"
            - "10000"
            - "--load-format"
            - "gguf"
            # The tokenizer is referenced by repository id rather than by a
            # path under /models.
            - "--tokenizer"
            - "katanemo/Arch-Router-1.5B"
            - "--served-model-name"
            - "Plano-Orchestrator"
            - "--gpu-memory-utilization"
            - "0.3"
            - "--tensor-parallel-size"
            - "1"
            - "--enable-prefix-caching"
          ports:
            - name: http
              containerPort: 10000
              protocol: TCP
          resources:
            requests:
              cpu: "1"
              memory: "4Gi"
              nvidia.com/gpu: "1"
            limits:
              cpu: "4"
              memory: "8Gi"
              nvidia.com/gpu: "1"
          volumeMounts:
            - name: model-cache
              mountPath: /models
          # Allows up to 10 minutes (60 failures x 10s) for the server to come
          # up; readiness and liveness checks are held off until this probe
          # succeeds, so a slow model load is not restarted by the liveness
          # probe.
          startupProbe:
            httpGet:
              path: /health
              port: 10000
            periodSeconds: 10
            failureThreshold: 60
          readinessProbe:
            httpGet:
              path: /health
              port: 10000
            initialDelaySeconds: 60
            periodSeconds: 10
          livenessProbe:
            httpGet:
              path: /health
              port: 10000
            initialDelaySeconds: 180
            periodSeconds: 30
      volumes:
        # emptyDir is pod-scoped scratch storage: the model is downloaded again
        # whenever a new pod is created.
        - name: model-cache
          emptyDir: {}
---
# Service: routes port 10000 to port 10000 on pods labelled
# app=plano-orchestrator. The type and protocol below are the Kubernetes
# defaults, written out explicitly.
apiVersion: v1
kind: Service
metadata:
  name: plano-orchestrator
spec:
  type: ClusterIP
  ports:
    - name: http
      protocol: TCP
      port: 10000
      targetPort: 10000
  selector:
    app: plano-orchestrator