mirror of
https://github.com/katanemo/plano.git
synced 2026-06-02 14:35:14 +02:00
add k8s deployment manifests and docs for self-hosted Arch-Router
This commit is contained in:
parent
f1b8c03e2f
commit
5b58bb60c3
7 changed files with 381 additions and 342 deletions
104
demos/llm_routing/model_routing_service/vllm-deployment.yaml
Normal file
104
demos/llm_routing/model_routing_service/vllm-deployment.yaml
Normal file
|
|
@ -0,0 +1,104 @@
|
|||
# Deployment that runs a single vLLM replica serving the Arch-Router GGUF
# model over HTTP on port 10000. An init container fetches the model into a
# volume shared with the serving container.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: arch-router
  labels:
    app: arch-router
spec:
  replicas: 1
  selector:
    matchLabels:
      app: arch-router
  template:
    metadata:
      labels:
        app: arch-router
    spec:
      # Let the pod schedule onto nodes tainted with nvidia.com/gpu:NoSchedule.
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      nodeSelector:
        # Replace with the label that identifies GPU nodes in your cluster
        # Examples:
        #   GKE: cloud.google.com/gke-accelerator: nvidia-l4
        #   EKS: eks.amazonaws.com/nodegroup: gpu-nodes
        #   AKS: kubernetes.azure.com/agentpool: gpupool
        node.kubernetes.io/instance-type: gpu-node
      initContainers:
        # Downloads the katanemo/Arch-Router-1.5B.gguf repository into the
        # shared model-cache volume before the vllm container starts.
        - name: download-model
          image: python:3.11-slim
          # The pip requirement is quoted so the shell never treats the
          # square brackets as a glob pattern.
          command:
            - sh
            - -c
            - |
              pip install "huggingface_hub[cli]" && \
              python -c "from huggingface_hub import snapshot_download; snapshot_download('katanemo/Arch-Router-1.5B.gguf', local_dir='/models/Arch-Router-1.5B.gguf')"
          volumeMounts:
            - name: model-cache
              mountPath: /models
      containers:
        - name: vllm
          # NOTE(review): `latest` is a floating tag; consider pinning to a
          # tested release so rollouts are reproducible.
          image: vllm/vllm-openai:latest
          command:
            - vllm
            - serve
            # Q4_K_M quantized weights written by the download-model init
            # container.
            - /models/Arch-Router-1.5B.gguf/Arch-Router-1.5B-Q4_K_M.gguf
            - "--host"
            - "0.0.0.0"
            - "--port"
            - "10000"
            - "--load-format"
            - "gguf"
            # The tokenizer is taken from the katanemo/Arch-Router-1.5B repo
            # rather than from the GGUF file.
            - "--tokenizer"
            - "katanemo/Arch-Router-1.5B"
            # Model name that clients pass in API requests.
            - "--served-model-name"
            - "Arch-Router"
            # Fraction of GPU memory vLLM is allowed to use.
            - "--gpu-memory-utilization"
            - "0.3"
            - "--tensor-parallel-size"
            - "1"
            - "--enable-prefix-caching"
          ports:
            - name: http
              containerPort: 10000
              protocol: TCP
          resources:
            requests:
              cpu: "1"
              memory: "4Gi"
              nvidia.com/gpu: "1"
            limits:
              cpu: "4"
              memory: "8Gi"
              nvidia.com/gpu: "1"
          volumeMounts:
            - name: model-cache
              mountPath: /models
          # Startup gate: allows up to failureThreshold * periodSeconds
          # (60 * 10s = 10 min) for vLLM to load the model and begin answering
          # /health. Readiness and liveness checks only begin once this probe
          # has succeeded, so a slow start is not mistaken for a hung server
          # and restarted mid-load.
          startupProbe:
            httpGet:
              path: /health
              port: 10000
            periodSeconds: 10
            failureThreshold: 60
          # No initialDelaySeconds below: the startupProbe already covers the
          # warm-up window.
          readinessProbe:
            httpGet:
              path: /health
              port: 10000
            periodSeconds: 10
          livenessProbe:
            httpGet:
              path: /health
              port: 10000
            periodSeconds: 30
      volumes:
        # emptyDir is per-pod scratch storage, so the model is downloaded
        # again whenever the pod is recreated.
        - name: model-cache
          emptyDir: {}
|
||||
---
|
||||
# Service that exposes the arch-router pods inside the cluster on port 10000.
# No `type` is set, so the Kubernetes default (ClusterIP) applies.
apiVersion: v1
kind: Service
metadata:
  name: arch-router
spec:
  ports:
    # Service port 10000 forwards to port 10000 on the selected pods.
    - name: http
      targetPort: 10000
      port: 10000
  # Routes traffic to pods carrying the label app=arch-router.
  selector:
    app: arch-router
|
||||
Loading…
Add table
Add a link
Reference in a new issue