add k8s deployment manifests and docs for self-hosted Arch-Router

2026-06-02 14:35:14 +02:00 · 2026-03-16 11:08:07 -07:00 · 2026-03-16 11:08:07 -07:00 · 5b58bb60c3
commit 5b58bb60c3
parent f1b8c03e2f
7 changed files with 381 additions and 342 deletions
--- a/demos/llm_routing/model_routing_service/vllm-deployment.yaml
+++ b/demos/llm_routing/model_routing_service/vllm-deployment.yaml
@ -0,0 +1,104 @@
+# Deployment: a single vLLM pod that serves the Arch-Router GGUF model over
+# HTTP on port 10000. An init container downloads the model file into a
+# volume shared with the serving container.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: arch-router
+  labels:
+    app: arch-router
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: arch-router
+  template:
+    metadata:
+      labels:
+        app: arch-router
+    spec:
+      # Allow scheduling onto nodes tainted with nvidia.com/gpu:NoSchedule.
+      tolerations:
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
+      nodeSelector:
+        # Replace with the label that identifies GPU nodes in your cluster
+        # Examples:
+        #   GKE: cloud.google.com/gke-accelerator: nvidia-l4
+        #   EKS: eks.amazonaws.com/nodegroup: gpu-nodes
+        #   AKS: kubernetes.azure.com/agentpool: gpupool
+        node.kubernetes.io/instance-type: gpu-node
+      initContainers:
+        # Fetches the model into the shared "model-cache" volume before vLLM
+        # starts. Only the quantization that the vllm container actually
+        # serves (Q4_K_M) is downloaded, so pod start-up does not pay for
+        # files in the repository that are never loaded.
+        - name: download-model
+          image: python:3.11-slim
+          command:
+            - sh
+            - -c
+            - |
+              set -e
+              pip install 'huggingface_hub[cli]'
+              python -c "
+              from huggingface_hub import snapshot_download
+              snapshot_download(
+                  'katanemo/Arch-Router-1.5B.gguf',
+                  local_dir='/models/Arch-Router-1.5B.gguf',
+                  allow_patterns=['Arch-Router-1.5B-Q4_K_M.gguf'],
+              )
+              "
+          volumeMounts:
+            - name: model-cache
+              mountPath: /models
+      containers:
+        - name: vllm
+          # NOTE(review): ":latest" is a moving tag. Pin a specific vLLM
+          # release that you have validated with GGUF loading so that
+          # rollouts are reproducible.
+          image: vllm/vllm-openai:latest
+          command:
+            - vllm
+            - serve
+            # Path written by the download-model init container.
+            - /models/Arch-Router-1.5B.gguf/Arch-Router-1.5B-Q4_K_M.gguf
+            - "--host"
+            - "0.0.0.0"
+            - "--port"
+            - "10000"
+            - "--load-format"
+            - "gguf"
+            # The tokenizer is taken from the base (non-GGUF) model repo.
+            - "--tokenizer"
+            - "katanemo/Arch-Router-1.5B"
+            # Model name that clients must send in their requests.
+            - "--served-model-name"
+            - "Arch-Router"
+            - "--gpu-memory-utilization"
+            - "0.3"
+            - "--tensor-parallel-size"
+            - "1"
+            - "--enable-prefix-caching"
+          ports:
+            - name: http
+              containerPort: 10000
+              protocol: TCP
+          resources:
+            requests:
+              cpu: "1"
+              memory: "4Gi"
+              nvidia.com/gpu: "1"
+            limits:
+              cpu: "4"
+              memory: "8Gi"
+              nvidia.com/gpu: "1"
+          volumeMounts:
+            - name: model-cache
+              mountPath: /models
+          # Model loading time varies with node and network speed. The
+          # startup probe gives the server up to 10 minutes
+          # (60 failures x 10 s) to answer /health; liveness and readiness
+          # checks only begin once it has succeeded, so a slow start is not
+          # mistaken for a hung process and restarted.
+          startupProbe:
+            httpGet:
+              path: /health
+              port: http
+            periodSeconds: 10
+            failureThreshold: 60
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: http
+            periodSeconds: 10
+          livenessProbe:
+            httpGet:
+              path: /health
+              port: http
+            periodSeconds: 30
+      volumes:
+        # emptyDir lives only as long as the pod: the model is downloaded
+        # again whenever the pod is recreated.
+        - name: model-cache
+          emptyDir: {}
+---
+# Service: cluster-internal endpoint that forwards port 10000 to port 10000
+# of the pods labelled app=arch-router.
+apiVersion: v1
+kind: Service
+metadata:
+  name: arch-router
+spec:
+  type: ClusterIP
+  selector:
+    app: arch-router
+  ports:
+    - name: http
+      protocol: TCP
+      port: 10000
+      targetPort: 10000