mirror of
https://github.com/dograh-hq/dograh.git
synced 2026-07-04 10:52:17 +02:00
Merge branch 'main' of https://github.com/dograh-hq/dograh
This commit is contained in:
commit
4f06f45170
27 changed files with 403 additions and 404 deletions
|
|
@ -71,39 +71,16 @@ silent. Each is exposed in `values.yaml` for operator override.
|
|||
|
||||
## `/tmp` audit (review fix #6)
|
||||
|
||||
The current docker-compose mounts a `shared-tmp` volume across all
|
||||
logical services so file handoffs between processes Just Work. In
|
||||
Kubernetes with separated pods this is broken by default.
|
||||
|
||||
**Findings:**
|
||||
|
||||
| File | Process | Behavior | Cross-pod? |
|
||||
|------|---------|----------|------------|
|
||||
| `api/services/pipecat/event_handlers.py` (lines 364–383) | **web** | Writes WAV + transcript via `NamedTemporaryFile`, then `enqueue_job(...)` to ARQ with the local path | **YES — broken** |
|
||||
| `api/tasks/s3_upload.py` | **arq-worker** | Receives `temp_file_path`, `os.path.exists`, uploads, deletes | **reads from web's path** |
|
||||
| `api/services/pipecat/in_memory_buffers.py` | web | Writes tempfiles consumed in the same process | No |
|
||||
| `api/services/pipecat/audio_file_cache.py` | web | Per-process cache | No |
|
||||
| `api/tasks/knowledge_base_processing.py` | arq-worker | Writes + reads in the same task | No |
|
||||
|
||||
**Mitigation in this chart:** `sharedTmp.enabled` flag in `values.yaml`.
|
||||
When enabled, the chart creates a `ReadWriteMany` PVC mounted into
|
||||
both `dograh-web` and `dograh-arq-worker` at
|
||||
`/tmp/dograh-shared/`. Default is `enabled: false` because most
|
||||
cloud-default storage classes are RWO; enabling it on RWO will fail
|
||||
PVC binding.
|
||||
|
||||
**If your cluster lacks an RWX storage class** (most cloud defaults are
|
||||
RWO), you MUST either:
|
||||
- provision an RWX class (EFS, Azure Files, Longhorn-RWX, Rook-Ceph) and
|
||||
set `sharedTmp.storageClassName`, or
|
||||
- complete the long-term fix in TODOs below before splitting web/worker.
|
||||
Resolved. End-of-call artifacts (recordings, transcript) are uploaded to
|
||||
object storage directly from the web process
|
||||
(`api/services/workflow_run_artifacts.py`) before the ARQ completion job
|
||||
is enqueued; the job carries only the workflow run id. No file handoff
|
||||
crosses a pod boundary, so web and arq-worker pods need no shared
|
||||
volume. The remaining `/tmp` uses (`audio_file_cache.py`,
|
||||
`knowledge_base_processing.py`) write and read within a single process.
|
||||
|
||||
## Open TODOs (deferred from v1)
|
||||
|
||||
- **Refactor `event_handlers.py` to handle uploads in-web.** Upload to
|
||||
object storage from the web process and pass the resulting storage
|
||||
key (not a local path) to the ARQ job. This removes the need for a
|
||||
shared `/tmp` PVC entirely.
|
||||
- **Leader election for singletons.** Adopt Kubernetes lease-based
|
||||
leader election so `ari-manager` / `campaign-orchestrator` can run
|
||||
HA. Until then, replicas remain hard-coded to 1.
|
||||
|
|
@ -162,7 +139,6 @@ deploy/helm/dograh/
|
|||
├── configmap.yaml
|
||||
├── secret.yaml
|
||||
├── migrate-job.yaml
|
||||
├── shared-tmp-pvc.yaml
|
||||
├── web-deployment.yaml
|
||||
├── web-service.yaml
|
||||
├── web-hpa.yaml
|
||||
|
|
|
|||
|
|
@ -57,18 +57,6 @@ Alembic migrations run as a post-install / pre-upgrade hook. Inspect with:
|
|||
kubectl logs job/{{ include "dograh.migrate.fullname" . }} -n {{ .Release.Namespace }}
|
||||
{{- end }}
|
||||
|
||||
=== /tmp shared volume ===
|
||||
{{- if .Values.sharedTmp.enabled }}
|
||||
sharedTmp.enabled=true — web and arq-worker pods mount a ReadWriteMany PVC
|
||||
at {{ .Values.sharedTmp.mountPath }} so end-of-call uploads survive pod boundaries.
|
||||
{{- else }}
|
||||
WARNING: sharedTmp.enabled=false. End-of-call uploads (event_handlers.py →
|
||||
ARQ s3_upload) hand off via /tmp paths. With separated web and worker pods
|
||||
this WILL fail unless you have an RWX storage class configured.
|
||||
|
||||
See deploy/helm/dograh/README.md "/tmp audit" section.
|
||||
{{- end }}
|
||||
|
||||
=== Singletons ===
|
||||
ari-manager and campaign-orchestrator run with replicas=1 and
|
||||
strategy=Recreate by design. Do NOT scale these via kubectl scale —
|
||||
|
|
|
|||
|
|
@ -227,21 +227,3 @@ are added inline because they may need composition from subchart secrets.
|
|||
- secretRef:
|
||||
name: {{ include "dograh.secretName" . }}
|
||||
{{- end }}
|
||||
|
||||
{{/*
|
||||
Volume mounts for the shared-tmp PVC when enabled.
|
||||
*/}}
|
||||
{{- define "dograh.sharedTmpVolumeMounts" -}}
|
||||
{{- if .Values.sharedTmp.enabled }}
|
||||
- name: shared-tmp
|
||||
mountPath: {{ .Values.sharedTmp.mountPath }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
||||
{{- define "dograh.sharedTmpVolumes" -}}
|
||||
{{- if .Values.sharedTmp.enabled }}
|
||||
- name: shared-tmp
|
||||
persistentVolumeClaim:
|
||||
claimName: {{ include "dograh.fullname" . }}-shared-tmp
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
|
|
|||
|
|
@ -45,10 +45,6 @@ spec:
|
|||
{{- toYaml .Values.workers.livenessProbe | nindent 12 }}
|
||||
resources:
|
||||
{{- toYaml .Values.workers.resources | nindent 12 }}
|
||||
volumeMounts:
|
||||
{{- include "dograh.sharedTmpVolumeMounts" . | nindent 12 }}
|
||||
volumes:
|
||||
{{- include "dograh.sharedTmpVolumes" . | nindent 8 }}
|
||||
{{- with .Values.workers.nodeSelector }}
|
||||
nodeSelector:
|
||||
{{- toYaml . | nindent 8 }}
|
||||
|
|
|
|||
|
|
@ -1,18 +0,0 @@
|
|||
{{- if .Values.sharedTmp.enabled }}
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: {{ include "dograh.fullname" . }}-shared-tmp
|
||||
namespace: {{ .Release.Namespace }}
|
||||
labels:
|
||||
{{- include "dograh.labels" . | nindent 4 }}
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteMany
|
||||
{{- if .Values.sharedTmp.storageClassName }}
|
||||
storageClassName: {{ .Values.sharedTmp.storageClassName | quote }}
|
||||
{{- end }}
|
||||
resources:
|
||||
requests:
|
||||
storage: {{ .Values.sharedTmp.size }}
|
||||
{{- end }}
|
||||
|
|
@ -68,10 +68,6 @@ spec:
|
|||
command: ["sh", "-c", "sleep {{ .Values.web.preStopSleepSeconds }}"]
|
||||
resources:
|
||||
{{- toYaml .Values.web.resources | nindent 12 }}
|
||||
volumeMounts:
|
||||
{{- include "dograh.sharedTmpVolumeMounts" . | nindent 12 }}
|
||||
volumes:
|
||||
{{- include "dograh.sharedTmpVolumes" . | nindent 8 }}
|
||||
{{- with .Values.web.nodeSelector }}
|
||||
nodeSelector:
|
||||
{{- toYaml . | nindent 8 }}
|
||||
|
|
|
|||
|
|
@ -150,28 +150,6 @@ secrets:
|
|||
langfusePublicKey: ""
|
||||
langfuseHost: ""
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Shared /tmp PVC.
|
||||
#
|
||||
# AUDIT FINDING: api/services/pipecat/event_handlers.py writes WAV/transcript
|
||||
# tempfiles in the web process and enqueues an ARQ job that reads those exact
|
||||
# paths in the worker (api/tasks/s3_upload.py). In compose this works because
|
||||
# all processes share the `shared-tmp` volume. In Kubernetes web and worker run
|
||||
# in separate pods. Options:
|
||||
# 1. Enable this PVC (ReadWriteMany required) to mount /tmp/dograh-shared
|
||||
# into both web and arq-worker pods. Use this for v1.
|
||||
# 2. Refactor event_handlers.py to upload from the web process and pass a
|
||||
# storage key (not a local path) to the ARQ job. Preferred long-term;
|
||||
# see deploy/helm/dograh/README.md "Open TODOs".
|
||||
# If your cluster lacks RWX (most cloud default storage classes are RWO),
|
||||
# you MUST take option (2) before splitting web and worker pods, or end-of-
|
||||
# call uploads will fail silently.
|
||||
sharedTmp:
|
||||
enabled: false
|
||||
storageClassName: "" # must be an RWX-capable class (e.g. efs-sc, azurefile, longhorn-rwx)
|
||||
size: 10Gi
|
||||
mountPath: /tmp/dograh-shared
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Web tier (FastAPI + WebSocket signaling)
|
||||
# -----------------------------------------------------------------------------
|
||||
|
|
|
|||
|
|
@ -138,8 +138,6 @@ services:
|
|||
api:
|
||||
image: ${REGISTRY:-dograhai}/dograh-api:${DOGRAH_VERSION:-latest}
|
||||
restart: unless-stopped
|
||||
volumes:
|
||||
- shared-tmp:/tmp
|
||||
environment:
|
||||
# production => drop private-IP host ICE candidates on a public VPS and
|
||||
# order TURN URIs UDP-first. Required for correct remote WebRTC.
|
||||
|
|
@ -266,8 +264,6 @@ volumes:
|
|||
redis_data:
|
||||
minio-data:
|
||||
driver: local
|
||||
shared-tmp:
|
||||
driver: local
|
||||
|
||||
networks:
|
||||
# Internal network for service-to-service traffic (db, redis, minio, coturn).
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue