From 88f4477edb7f52ebf200678b1593e468a49dedb8 Mon Sep 17 00:00:00 2001 From: Abhishek Date: Fri, 3 Jul 2026 12:39:39 +0530 Subject: [PATCH] feat: add Helm chart for Kubernetes deployment (#365) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: add Helm chart for Kubernetes deployment Co-Authored-By: Claude Opus 4.7 (1M context) * Replace bundled Bitnami subcharts with in-chart manifests on official images The Bitnami catalog removed all versioned image tags from docker.io/bitnami in Aug 2025 (old images frozen in bitnamilegacy, maintained catalog now behind a Broadcom subscription), so the bundled postgresql/redis/minio subcharts no longer pull. Replace them with plain in-chart manifests built on official upstream images, keeping the internal/all-in-one path fully self-contained and free of third-party chart packaging that can disappear: - internal-postgres.yaml: pgvector/pgvector:pg17 — upstream Postgres plus the `vector` extension the migrations require. POSTGRES_USER=dograh is the initdb superuser, so CREATE EXTENSION vector succeeds. - internal-redis.yaml: redis:7.4-alpine, password-protected, AOF persistence. - internal-minio.yaml: minio/minio, root creds shared with the app via a single secret (can't drift); the app auto-creates its bucket. Service/secret names are unchanged (-postgresql, -redisinternal-master, -minio) so the app wiring is untouched. Dep passwords are generated once and persisted across upgrades via lookup. Drop the Chart.yaml dependencies, Chart.lock, and the `helm dependency` step; the internal manifests gate on the mode toggles (database.mode=internal, etc.). Also fixes surfaced by smoke-testing on a live EKS cluster: - Dockerfile: ship the per-service run_*.sh entrypoints the chart invokes. - migrate-job: run as a post-install/pre-upgrade hook (the bundled Postgres does not exist during pre-install) with a wait-for-postgres init container. 
- backend env: declare POSTGRES_PASSWORD/REDIS_PASSWORD before the DATABASE_URL/ REDIS_URL that interpolate them (Kubernetes only expands back-references). - worker liveness probes: pgrep isn't in the slim runtime image; check /proc/1/cmdline instead (each worker execs its process as PID 1). - UI: set HOSTNAME=0.0.0.0 so Next.js standalone doesn't bind to the k8s-injected pod name (which maps to the pod IP only, breaking port-forward/loopback). Verified end-to-end on EKS 1.36: all pods Ready, migrations applied (pgvector extension + 27 tables), UI login page and web API served via port-forward. Co-Authored-By: Claude Opus 4.8 (1M context) --------- Co-authored-by: Claude Opus 4.7 (1M context) --- api/Dockerfile | 17 +- deploy/helm/dograh/.gitignore | 3 + deploy/helm/dograh/Chart.yaml | 37 ++ deploy/helm/dograh/README.md | 184 ++++++ deploy/helm/dograh/examples/values-aws.yaml | 66 +++ .../helm/dograh/examples/values-managed.yaml | 68 +++ .../dograh/examples/values-single-node.yaml | 59 ++ deploy/helm/dograh/templates/NOTES.txt | 92 +++ deploy/helm/dograh/templates/_helpers.tpl | 247 ++++++++ .../templates/ari-manager-deployment.yaml | 64 ++ .../templates/arq-worker-deployment.yaml | 63 ++ .../campaign-orchestrator-deployment.yaml | 64 ++ deploy/helm/dograh/templates/configmap.yaml | 38 ++ .../dograh/templates/coturn-configmap.yaml | 37 ++ .../dograh/templates/coturn-deployment.yaml | 98 ++++ .../helm/dograh/templates/coturn-service.yaml | 47 ++ deploy/helm/dograh/templates/gateway.yaml | 34 ++ .../helm/dograh/templates/httproute-api.yaml | 33 ++ .../dograh/templates/httproute-minio.yaml | 46 ++ .../helm/dograh/templates/httproute-ui.yaml | 33 ++ deploy/helm/dograh/templates/ingress.yaml | 66 +++ .../helm/dograh/templates/internal-minio.yaml | 137 +++++ .../dograh/templates/internal-postgres.yaml | 118 ++++ .../helm/dograh/templates/internal-redis.yaml | 115 ++++ deploy/helm/dograh/templates/migrate-job.yaml | 68 +++ deploy/helm/dograh/templates/secret.yaml | 35 ++ 
.../helm/dograh/templates/serviceaccount.yaml | 13 + .../helm/dograh/templates/shared-tmp-pvc.yaml | 18 + .../helm/dograh/templates/ui-deployment.yaml | 80 +++ deploy/helm/dograh/templates/ui-pdb.yaml | 16 + deploy/helm/dograh/templates/ui-service.yaml | 20 + .../helm/dograh/templates/web-deployment.yaml | 90 +++ deploy/helm/dograh/templates/web-hpa.yaml | 38 ++ deploy/helm/dograh/templates/web-pdb.yaml | 16 + deploy/helm/dograh/templates/web-service.yaml | 22 + deploy/helm/dograh/values.schema.json | 48 ++ deploy/helm/dograh/values.yaml | 554 ++++++++++++++++++ scripts/run_ari_manager.sh | 12 + scripts/run_arq_worker.sh | 12 + scripts/run_campaign_orchestrator.sh | 12 + scripts/run_migrate.sh | 12 + scripts/run_web.sh | 14 + 42 files changed, 2845 insertions(+), 1 deletion(-) create mode 100644 deploy/helm/dograh/.gitignore create mode 100644 deploy/helm/dograh/Chart.yaml create mode 100644 deploy/helm/dograh/README.md create mode 100644 deploy/helm/dograh/examples/values-aws.yaml create mode 100644 deploy/helm/dograh/examples/values-managed.yaml create mode 100644 deploy/helm/dograh/examples/values-single-node.yaml create mode 100644 deploy/helm/dograh/templates/NOTES.txt create mode 100644 deploy/helm/dograh/templates/_helpers.tpl create mode 100644 deploy/helm/dograh/templates/ari-manager-deployment.yaml create mode 100644 deploy/helm/dograh/templates/arq-worker-deployment.yaml create mode 100644 deploy/helm/dograh/templates/campaign-orchestrator-deployment.yaml create mode 100644 deploy/helm/dograh/templates/configmap.yaml create mode 100644 deploy/helm/dograh/templates/coturn-configmap.yaml create mode 100644 deploy/helm/dograh/templates/coturn-deployment.yaml create mode 100644 deploy/helm/dograh/templates/coturn-service.yaml create mode 100644 deploy/helm/dograh/templates/gateway.yaml create mode 100644 deploy/helm/dograh/templates/httproute-api.yaml create mode 100644 deploy/helm/dograh/templates/httproute-minio.yaml create mode 100644 
deploy/helm/dograh/templates/httproute-ui.yaml create mode 100644 deploy/helm/dograh/templates/ingress.yaml create mode 100644 deploy/helm/dograh/templates/internal-minio.yaml create mode 100644 deploy/helm/dograh/templates/internal-postgres.yaml create mode 100644 deploy/helm/dograh/templates/internal-redis.yaml create mode 100644 deploy/helm/dograh/templates/migrate-job.yaml create mode 100644 deploy/helm/dograh/templates/secret.yaml create mode 100644 deploy/helm/dograh/templates/serviceaccount.yaml create mode 100644 deploy/helm/dograh/templates/shared-tmp-pvc.yaml create mode 100644 deploy/helm/dograh/templates/ui-deployment.yaml create mode 100644 deploy/helm/dograh/templates/ui-pdb.yaml create mode 100644 deploy/helm/dograh/templates/ui-service.yaml create mode 100644 deploy/helm/dograh/templates/web-deployment.yaml create mode 100644 deploy/helm/dograh/templates/web-hpa.yaml create mode 100644 deploy/helm/dograh/templates/web-pdb.yaml create mode 100644 deploy/helm/dograh/templates/web-service.yaml create mode 100644 deploy/helm/dograh/values.schema.json create mode 100644 deploy/helm/dograh/values.yaml create mode 100755 scripts/run_ari_manager.sh create mode 100755 scripts/run_arq_worker.sh create mode 100755 scripts/run_campaign_orchestrator.sh create mode 100755 scripts/run_migrate.sh create mode 100755 scripts/run_web.sh diff --git a/api/Dockerfile b/api/Dockerfile index 1a8d48dc..2ef3ddc8 100644 --- a/api/Dockerfile +++ b/api/Dockerfile @@ -141,7 +141,22 @@ ENV PYTHONUNBUFFERED=1 # Copy application code (chown at copy-time avoids a duplicate /app layer # from a later `RUN chown -R`, which would double the on-disk size of /app). COPY --chown=dograh:dograh ./api ./api -COPY --chown=dograh:dograh ./scripts/start_services_docker.sh ./scripts/start_services_docker.sh + +# Entrypoint scripts. +# start_services_docker.sh — single-container (docker-compose) entrypoint +# that runs every service in one process tree. 
+# run_*.sh — per-service entrypoints used by the Helm chart, +# which runs each workload (web, arq-worker, ari-manager, +# campaign-orchestrator, migrate) as its own pod. Keep this list in sync +# with the command:[] entries in deploy/helm/dograh/templates/*.yaml. +COPY --chown=dograh:dograh \ + ./scripts/start_services_docker.sh \ + ./scripts/run_migrate.sh \ + ./scripts/run_web.sh \ + ./scripts/run_arq_worker.sh \ + ./scripts/run_ari_manager.sh \ + ./scripts/run_campaign_orchestrator.sh \ + ./scripts/ # ts_validator Node deps (built in ts-deps stage with full node:22-slim image). # The validator runs as a short-lived subprocess from api/mcp_server/ts_bridge.py. diff --git a/deploy/helm/dograh/.gitignore b/deploy/helm/dograh/.gitignore new file mode 100644 index 00000000..a097806d --- /dev/null +++ b/deploy/helm/dograh/.gitignore @@ -0,0 +1,3 @@ +# charts/ is where `helm dependency build` places subchart tarballs. This chart +# declares no dependencies (there is no Chart.lock), so keep any local copy out of git. +charts/ diff --git a/deploy/helm/dograh/Chart.yaml b/deploy/helm/dograh/Chart.yaml new file mode 100644 index 00000000..607d2746 --- /dev/null +++ b/deploy/helm/dograh/Chart.yaml @@ -0,0 +1,37 @@ +apiVersion: v2 +name: dograh +description: | + Dograh — open-source voice AI platform. Deploys the FastAPI backend + (decomposed into web, ARQ worker, ARI manager singleton, and campaign + orchestrator singleton), the Next.js UI, and coturn for WebRTC media + relay. Optional bundled PostgreSQL, Redis, and MinIO via in-chart manifests. +type: application + +# version: chart version. Bump for any chart change. +# appVersion: Dograh application version. Tracks the image tag. 
+version: 0.1.0 +appVersion: "0.1.0" + +kubeVersion: ">=1.28.0-0" + +keywords: + - voice-ai + - webrtc + - telephony + - fastapi + +home: https://dograh.com +sources: + - https://github.com/dograh-hq/dograh + +maintainers: + - name: Dograh + +# The bundled stateful deps (PostgreSQL, Redis, MinIO) for the internal/all-in-one +# modes are plain in-chart manifests (templates/internal-*.yaml) built on official +# upstream images — NOT subcharts. This avoids depending on third-party chart +# packaging/registries that can change or disappear (as the Bitnami catalog did in +# Aug 2025). Production deployments should use the external/managed modes +# (database.mode=external, redis.mode=external, storage.mode=s3|externalMinio). +# +# dependencies: [] diff --git a/deploy/helm/dograh/README.md b/deploy/helm/dograh/README.md new file mode 100644 index 00000000..31308333 --- /dev/null +++ b/deploy/helm/dograh/README.md @@ -0,0 +1,184 @@ +# Dograh Helm chart + +Deploys Dograh on Kubernetes with decomposed backend workloads (web, +ARQ workers, telephony singleton, campaign singleton), Next.js UI, and +coturn for WebRTC media relay. Implements the architecture defined in +`HELM_DEPLOYMENT_PLAN.md` at the repo root. + +## Status + +v1, alpha. Validated with `helm lint` and `helm template`, and smoke-tested +end-to-end on a live EKS cluster. + +## Quick start + +```bash +cd deploy/helm/dograh + +# Install with defaults (all internal deps, Gateway API exposure). +# The bundled Postgres/Redis/MinIO are in-chart manifests on official upstream +# images — no `helm dependency` / subchart pull step needed. +helm install dograh . \ + --set secrets.ossJwtSecret="$(openssl rand -hex 32)" \ + --set secrets.turnSecret="$(openssl rand -hex 32)" \ + --set exposure.gatewayApi.gatewayClassName=istio +``` + +See `examples/values-single-node.yaml`, `examples/values-managed.yaml`, +and `examples/values-aws.yaml` for topology-specific overrides. 
+ +## Architecture summary + +| Workload | Replicas | Strategy | Notes | +|------------------------------|-------------|-----------------|-------| +| `dograh-web` | 2 (HPA opt) | RollingUpdate | Long-lived WS, graceful drain | +| `dograh-arq-worker` | 1 (knob) | RollingUpdate | Stateless | +| `dograh-ari-manager` | **1 fixed** | **Recreate** | Telephony singleton | +| `dograh-campaign-orchestrator` | **1 fixed** | **Recreate** | Campaign singleton (in-memory locks) | +| `dograh-ui` | 2 | RollingUpdate | Next.js SSR | +| `dograh-coturn` | 1 | Recreate | LoadBalancer Service, port-pinned | + +HTTP traffic: Gateway API (default) or Ingress (fallback). +TURN traffic: dedicated L4 Service of type `LoadBalancer`. + +## Decisions log + +These are choices the chart made where `HELM_DEPLOYMENT_PLAN.md` was +silent. Unless noted below, each is exposed in `values.yaml` for operator override. + +- **terminationGracePeriodSeconds for web: 600s.** Covers a 10-minute + call; tune to your call-length distribution. +- **preStop sleep: 15s.** Conservative window for the gateway/ingress + to observe pod NotReady and stop dispatching new connections. +- **Liveness probes on singletons: `exec` (checks `/proc/1/cmdline`).** No + HTTP endpoint exists on ari-manager / campaign-orchestrator, and `pgrep` is + not in the slim runtime image; a process-alive check is the simplest correct signal. +- **HPA on web: CPU/memory, disabled by default.** Plan recommends HPA + but CPU/memory is a poor signal for WS workloads. Default + `autoscaling.web.enabled=false`; flip on with a knowing eye and plan + to replace with a connection-count metric. +- **Singleton replica counts: hard-coded.** No `replicaCount` knob + exposed on ari-manager / campaign-orchestrator. Prevents accidental + `kubectl scale` corrupting in-memory dedup state. +- **MinIO browser exposure: shared host, path prefix `/voice-audio/`.** + Mirrors current nginx behavior. 
Operators wanting a separate + hostname can override by editing `httproute-minio.yaml` or + `ingress.yaml` post-install. 
+- **NetworkPolicy: not in v1.** TODO below. +- **ServiceMonitor / Prometheus: not in v1.** TODO below. +- **TURN TLS (turns://): not in v1.** Original docker-compose exposed + port 5349 but never wired certs. Chart scopes v1 to plain TURN. + +## `/tmp` audit (review fix #6) + +The current docker-compose mounts a `shared-tmp` volume across all +logical services so file handoffs between processes Just Work. In +Kubernetes with separated pods this is broken by default. + +**Findings:** + +| File | Process | Behavior | Cross-pod? | +|------|---------|----------|------------| +| `api/services/pipecat/event_handlers.py` (lines 364–383) | **web** | Writes WAV + transcript via `NamedTemporaryFile`, then `enqueue_job(...)` to ARQ with the local path | **YES — broken** | +| `api/tasks/s3_upload.py` | **arq-worker** | Receives `temp_file_path`, `os.path.exists`, uploads, deletes | **reads from web's path** | +| `api/services/pipecat/in_memory_buffers.py` | web | Writes tempfiles consumed in the same process | No | +| `api/services/pipecat/audio_file_cache.py` | web | Per-process cache | No | +| `api/tasks/knowledge_base_processing.py` | arq-worker | Writes + reads in the same task | No | + +**Mitigation in this chart:** `sharedTmp.enabled` flag in `values.yaml`. +When enabled, the chart creates a `ReadWriteMany` PVC mounted into +both `dograh-web` and `dograh-arq-worker` at +`/tmp/dograh-shared/`. Default is `enabled: false` because most +cloud-default storage classes are RWO; enabling it on RWO will fail +PVC binding. + +**If your cluster lacks an RWX storage class** (most cloud defaults are +RWO), you MUST either: +- provision an RWX class (EFS, Azure Files, Longhorn-RWX, Rook-Ceph) and + set `sharedTmp.storageClassName`, or +- complete the long-term fix in TODOs below before splitting web/worker. 
+ +## Open TODOs (deferred from v1) + +- **Refactor `event_handlers.py` to handle uploads in-web.** Upload to + object storage from the web process and pass the resulting storage + key (not a local path) to the ARQ job. This removes the need for a + shared `/tmp` PVC entirely. +- **Leader election for singletons.** Adopt Kubernetes lease-based + leader election so `ari-manager` / `campaign-orchestrator` can run + HA. Until then, replicas remain hard-coded to 1. +- **Connection-count HPA metric.** Expose active WS sessions per pod + (Prometheus or KEDA) and replace CPU/memory HPA target. +- **NetworkPolicy.** Add default-deny + explicit egress to Postgres, + Redis, MinIO/S3, and (for ari-manager) Asterisk. +- **ServiceMonitor.** First-class Prometheus integration once + observability stack is selected. +- **TURN TLS (turns://).** Wire certificate paths through coturn config + and document the cert-manager pattern. +- **MinIO public route via separate hostname.** Make `/voice-audio/` + path-prefix the default but allow operators to opt into a dedicated + hostname. +- **KEDA for ARQ workers.** When a queue-depth metric is available, + switch ARQ from fixed replicas to KEDA-driven scaling. + +## Validation + +```bash +cd deploy/helm/dograh + +helm lint . +helm template test-release . > /tmp/render-default.yaml +helm template test-release . -f examples/values-single-node.yaml > /tmp/render-single.yaml +helm template test-release . -f examples/values-managed.yaml > /tmp/render-managed.yaml +helm template test-release . -f examples/values-aws.yaml > /tmp/render-aws.yaml +``` + +Spot-check expectations: +- `Deployment/<fullname>-ari-manager` has `replicas: 1` and + `strategy.type: Recreate`. +- `Deployment/<fullname>-campaign-orchestrator` has `replicas: 1` and + `strategy.type: Recreate`. +- `Deployment/<fullname>-web` has `terminationGracePeriodSeconds: 600` + and a `lifecycle.preStop` exec hook. +- Liveness probe on ari-manager / campaign-orchestrator uses `exec`, + not `httpGet`. 
+ +## Layout + +``` +deploy/helm/dograh/ +├── Chart.yaml +├── values.yaml # heavily commented +├── values.schema.json # enforces mode enums +├── README.md # this file +├── examples/ +│ ├── values-single-node.yaml +│ ├── values-managed.yaml +│ └── values-aws.yaml +└── templates/ + ├── _helpers.tpl + ├── NOTES.txt + ├── serviceaccount.yaml + ├── configmap.yaml + ├── secret.yaml + ├── migrate-job.yaml + ├── shared-tmp-pvc.yaml + ├── web-deployment.yaml + ├── web-service.yaml + ├── web-hpa.yaml + ├── web-pdb.yaml + ├── arq-worker-deployment.yaml + ├── ari-manager-deployment.yaml + ├── campaign-orchestrator-deployment.yaml + ├── ui-deployment.yaml + ├── ui-service.yaml + ├── ui-pdb.yaml + ├── coturn-deployment.yaml + ├── coturn-service.yaml + ├── coturn-configmap.yaml + ├── gateway.yaml + ├── httproute-api.yaml + ├── httproute-ui.yaml + ├── httproute-minio.yaml + └── ingress.yaml +``` diff --git a/deploy/helm/dograh/examples/values-aws.yaml b/deploy/helm/dograh/examples/values-aws.yaml new file mode 100644 index 00000000..39175b36 --- /dev/null +++ b/deploy/helm/dograh/examples/values-aws.yaml @@ -0,0 +1,66 @@ +# AWS EKS — uses ALB (via AWS Gateway API controller) for HTTP and NLB +# for coturn. Assumes: +# - aws-load-balancer-controller is installed +# - aws Gateway API controller is installed (gateway.networking.k8s.io) +# - IRSA configured for the dograh ServiceAccount when using S3 +# +# REQUIRED OVERRIDES: +# --set secrets.ossJwtSecret=$(openssl rand -hex 32) +# --set secrets.turnSecret=$(openssl rand -hex 32) +# --set exposure.gatewayApi.listenerHostname=dograh.example.com +# --set storage.s3.bucket=... +# +# After install, retrieve coturn NLB address and re-upgrade: +# LB_IP=$(kubectl get svc dograh-coturn -o jsonpath='{.status.loadBalancer.ingress[0].hostname}') +# helm upgrade dograh . 
--reuse-values --set coturn.externalIp=$LB_IP --set config.turnHost=$LB_IP + +database: + mode: external # use RDS Postgres +redis: + mode: external # use ElastiCache Redis +storage: + mode: s3 + s3: + region: us-east-1 + bucket: "" # set via --set + +exposure: + mode: gatewayApi + gatewayApi: + createGateway: true + gatewayClassName: aws-alb + listenerHostname: "" # set via --set + + ingress: + tls: + enabled: true + secretName: "" # cert ARN via ALB annotations instead; see below + +# coturn on NLB. AWS Gateway API only handles L7; coturn keeps a plain +# Service of type LoadBalancer with NLB annotations. +coturn: + service: + type: LoadBalancer + externalTrafficPolicy: Local + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: external + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing + +# IRSA: bind a role with S3 permissions to the dograh ServiceAccount. +serviceAccount: + create: true + annotations: + eks.amazonaws.com/role-arn: "" # set via --set + +web: + replicaCount: 3 + +autoscaling: + web: + enabled: true + minReplicas: 3 + maxReplicas: 12 + +# No bundled deps: the in-chart Postgres/Redis/MinIO manifests are gated on the +# internal modes, so the external/S3 modes above already keep them from rendering. diff --git a/deploy/helm/dograh/examples/values-managed.yaml b/deploy/helm/dograh/examples/values-managed.yaml new file mode 100644 index 00000000..cdb55bc1 --- /dev/null +++ b/deploy/helm/dograh/examples/values-managed.yaml @@ -0,0 +1,68 @@ +# Production: external Postgres + Redis, S3 storage, Gateway API exposure. +# Suitable for managed Kubernetes (EKS, GKE, AKS) with managed DBs. +# +# REQUIRED OVERRIDES at install time: +# --set secrets.databaseUrl=... +# --set secrets.redisUrl=... 
+# --set secrets.ossJwtSecret=$(openssl rand -hex 32) +# --set secrets.turnSecret=$(openssl rand -hex 32) +# --set exposure.gatewayApi.gatewayClassName= +# --set exposure.gatewayApi.listenerHostname=dograh.example.com + +database: + mode: external +redis: + mode: external +storage: + mode: s3 + s3: + region: us-east-1 + bucket: dograh-voice-audio + publicEndpoint: https://dograh-voice-audio.s3.amazonaws.com +exposure: + mode: gatewayApi + gatewayApi: + createGateway: true + # gatewayClassName MUST be set via --set or override file. + listenerHostname: "" # set to your hostname + ingress: + tls: + enabled: true + secretName: dograh-tls + +config: + environment: production + logLevel: INFO + enableAwsS3: true + +web: + replicaCount: 3 + resources: + requests: + cpu: 500m + memory: 1Gi + limits: + cpu: "4" + memory: 4Gi + +workers: + replicaCount: 2 + resources: + requests: + cpu: 250m + memory: 512Mi + limits: + cpu: "2" + memory: 2Gi + +autoscaling: + web: + enabled: true + minReplicas: 3 + maxReplicas: 12 + targetCPUUtilizationPercentage: 70 + +# No bundled deps here: the in-chart Postgres/Redis/MinIO manifests are gated on +# the internal modes (database.mode=internal, redis.mode=internal, +# storage.mode=internalMinio), so the external/S3 modes set above already keep +# them from rendering. diff --git a/deploy/helm/dograh/examples/values-single-node.yaml b/deploy/helm/dograh/examples/values-single-node.yaml new file mode 100644 index 00000000..b0f293f3 --- /dev/null +++ b/deploy/helm/dograh/examples/values-single-node.yaml @@ -0,0 +1,59 @@ +# Single-node deployment (k3s, minikube, single VM). +# All stateful deps bundled in-cluster, Ingress for HTTP, smaller resources. 
+ +database: + mode: internal +redis: + mode: internal +storage: + mode: internalMinio +exposure: + mode: ingress + ingress: + className: nginx + host: dograh.local + +config: + environment: production + logLevel: INFO + +web: + replicaCount: 1 + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: "1" + memory: 1Gi + pdb: + enabled: false + +workers: + replicaCount: 1 + resources: + requests: + cpu: 50m + memory: 128Mi + limits: + cpu: 500m + memory: 512Mi + +ui: + replicaCount: 1 + pdb: + enabled: false + +autoscaling: + web: + enabled: false + +postgresql: + persistence: + size: 2Gi +redisinternal: + persistence: + size: 1Gi +minio: + persistence: + size: 5Gi diff --git a/deploy/helm/dograh/templates/NOTES.txt b/deploy/helm/dograh/templates/NOTES.txt new file mode 100644 index 00000000..ab74de81 --- /dev/null +++ b/deploy/helm/dograh/templates/NOTES.txt @@ -0,0 +1,92 @@ +Dograh has been installed. + +Release: {{ .Release.Name }} +Namespace: {{ .Release.Namespace }} +Chart: {{ .Chart.Name }}-{{ .Chart.Version }} + +=== HTTP exposure ({{ .Values.exposure.mode }}) === +{{- if eq .Values.exposure.mode "gatewayApi" }} +{{- if .Values.exposure.gatewayApi.createGateway }} +A Gateway named {{ include "dograh.fullname" . }} was created with class +"{{ .Values.exposure.gatewayApi.gatewayClassName }}". Find its address with: + + kubectl get gateway {{ include "dograh.fullname" . }} -n {{ .Release.Namespace }} \ + -o jsonpath='{.status.addresses[0].value}' +{{- else }} +HTTPRoutes were attached to existing Gateway(s): +{{- range .Values.exposure.gatewayApi.parentRefs }} + - {{ .name }}{{ if .namespace }}/{{ .namespace }}{{ end }} +{{- end }} +{{- end }} +{{- else }} +Ingress class: "{{ .Values.exposure.ingress.className }}" +Host: {{ default "(unset — set exposure.ingress.host)" .Values.exposure.ingress.host }} +{{- end }} + +=== TURN (coturn) === +{{- if .Values.coturn.enabled }} +The coturn Service is type LoadBalancer. 
Find its external address: + + kubectl get svc {{ include "dograh.coturn.fullname" . }} -n {{ .Release.Namespace }} \ + -o jsonpath='{.status.loadBalancer.ingress[0].ip}{.status.loadBalancer.ingress[0].hostname}' + +IMPORTANT — chicken-and-egg with coturn.externalIp: +coturn announces an external IP in ICE candidates. The LoadBalancer IP is +typically not known until after install. Once the LB has an address: + + helm upgrade {{ .Release.Name }} . \ + --reuse-values \ + --set coturn.externalIp=<LB_ADDRESS> \ + --set config.turnHost=<LB_ADDRESS> + +Until then, WebRTC media will be impaired in relay-only scenarios. + +NLB listener-quota note (AWS): the default coturn relay range is +{{ .Values.coturn.relayPortRange.min }}-{{ .Values.coturn.relayPortRange.max }}, which is +{{ sub (int .Values.coturn.relayPortRange.max) (int .Values.coturn.relayPortRange.min) | add1 }} ports. +AWS NLB default quota is 50 listeners per LB. Widening the range requires either +a quota increase or splitting TURN across multiple LBs. +{{- else }} +coturn is disabled. Set coturn.enabled=true to deploy media relay. +{{- end }} + +=== Migrations === +{{- if .Values.migrate.enabled }} +Alembic migrations run as a post-install / pre-upgrade hook. Inspect with: + + kubectl logs job/{{ include "dograh.migrate.fullname" . }} -n {{ .Release.Namespace }} +{{- end }} + +=== /tmp shared volume === +{{- if .Values.sharedTmp.enabled }} +sharedTmp.enabled=true — web and arq-worker pods mount a ReadWriteMany PVC +at {{ .Values.sharedTmp.mountPath }} so end-of-call uploads survive pod boundaries. +{{- else }} +WARNING: sharedTmp.enabled=false. End-of-call uploads (event_handlers.py → +ARQ s3_upload) hand off via /tmp paths. With separated web and worker pods +this WILL fail unless you set sharedTmp.enabled=true with an RWX storage class. + +See deploy/helm/dograh/README.md "/tmp audit" section. 
+{{- end }} + +=== Singletons === +ari-manager and campaign-orchestrator run with replicas=1 and +strategy=Recreate by design. 
Do NOT scale these via kubectl scale — +they use in-memory locks and would silently corrupt with >1 replica. + +=== Required overrides === +{{- if eq .Values.secrets.ossJwtSecret "ChangeMeInProduction" }} +WARNING: secrets.ossJwtSecret is still the chart default. Override before +running in any non-dev environment: + + --set secrets.ossJwtSecret="$(openssl rand -hex 32)" +{{- end }} +{{- if and (eq .Values.database.mode "external") (empty .Values.secrets.databaseUrl) }} +ERROR: database.mode=external but secrets.databaseUrl is empty. +{{- end }} +{{- if and (eq .Values.redis.mode "external") (empty .Values.secrets.redisUrl) }} +ERROR: redis.mode=external but secrets.redisUrl is empty. +{{- end }} + +For troubleshooting and topology examples see +deploy/helm/dograh/README.md and examples/. diff --git a/deploy/helm/dograh/templates/_helpers.tpl b/deploy/helm/dograh/templates/_helpers.tpl new file mode 100644 index 00000000..d8469848 --- /dev/null +++ b/deploy/helm/dograh/templates/_helpers.tpl @@ -0,0 +1,247 @@ +{{/* +Common helpers. +*/}} + +{{- define "dograh.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{- define "dograh.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{- define "dograh.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{- define "dograh.labels" -}} +helm.sh/chart: {{ include "dograh.chart" . }} +{{ include "dograh.selectorLabels" . 
}} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{- define "dograh.selectorLabels" -}} +app.kubernetes.io/name: {{ include "dograh.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{- define "dograh.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "dograh.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} + +{{/* +Component-specific names. +*/}} +{{- define "dograh.web.fullname" -}}{{ include "dograh.fullname" . }}-web{{- end }} +{{- define "dograh.arqWorker.fullname" -}}{{ include "dograh.fullname" . }}-arq-worker{{- end }} +{{- define "dograh.ariManager.fullname" -}}{{ include "dograh.fullname" . }}-ari-manager{{- end }} +{{- define "dograh.campaignOrchestrator.fullname" -}}{{ include "dograh.fullname" . }}-campaign-orchestrator{{- end }} +{{- define "dograh.ui.fullname" -}}{{ include "dograh.fullname" . }}-ui{{- end }} +{{- define "dograh.coturn.fullname" -}}{{ include "dograh.fullname" . }}-coturn{{- end }} +{{- define "dograh.migrate.fullname" -}}{{ include "dograh.fullname" . }}-migrate{{- end }} + +{{- define "dograh.configMapName" -}}{{ include "dograh.fullname" . }}-config{{- end }} +{{- define "dograh.secretName" -}} +{{- if .Values.secrets.existingSecret -}} +{{- .Values.secrets.existingSecret -}} +{{- else -}} +{{- include "dograh.fullname" . }}-secret +{{- end -}} +{{- end }} + +{{/* +Image reference. 
+*/}} +{{- define "dograh.image" -}} +{{- $registry := .Values.image.registry | default "docker.io" -}} +{{- printf "%s/%s:%s" $registry .Values.image.repository .Values.image.tag -}} +{{- end }} + +{{- define "dograh.ui.image" -}} +{{- $registry := .Values.ui.image.registry | default "docker.io" -}} +{{- printf "%s/%s:%s" $registry .Values.ui.image.repository .Values.ui.image.tag -}} +{{- end }} + +{{- define "dograh.coturn.image" -}} +{{- $registry := .Values.coturn.image.registry | default "docker.io" -}} +{{- printf "%s/%s:%s" $registry .Values.coturn.image.repository .Values.coturn.image.tag -}} +{{- end }} + +{{/* +No-op placeholder left over from the subchart-based design; it renders nothing, so any +`include "dograh.deps.resolved" .` call is harmless. Bundled deps are now in-chart manifests gated on the mode toggles. +*/}} +{{- define "dograh.deps.resolved" -}} +{{- /* intentionally empty — nothing to compute without subcharts */ -}} +{{- end }} + +{{/* +In-cluster service references for internal deps. +*/}} +{{- define "dograh.postgresHost" -}}{{ .Release.Name }}-postgresql{{- end }} +{{- define "dograh.redisHost" -}}{{ .Release.Name }}-redisinternal-master{{- end }} +{{- define "dograh.minioHost" -}}{{ .Release.Name }}-minio{{- end }} + +{{/* +Resolved passwords for the bundled internal deps. +Precedence: explicit value in values.yaml wins; else reuse the value already +stored in the Secret (so `helm upgrade` does NOT rotate the password and desync a +running datastore); else generate a fresh one. `lookup` returns empty during +`helm template` (no cluster), so dry renders get a throwaway random value — fine. +Each is materialized in exactly one place (the dep's Secret); every other +reference is a secretKeyRef, so the generated value is stable within a render. 
+*/}}
+{{- define "dograh.postgresPassword" -}}
+{{- if .Values.postgresql.auth.password -}}
+{{- .Values.postgresql.auth.password -}}
+{{- else -}}
+{{- $s := lookup "v1" "Secret" .Release.Namespace (printf "%s-postgresql" .Release.Name) -}}
+{{- if and $s $s.data (index $s.data "password") -}}
+{{- index $s.data "password" | b64dec -}}
+{{- else -}}
+{{- randAlphaNum 24 -}}
+{{- end -}}
+{{- end -}}
+{{- end }}
+
+{{- define "dograh.redisPassword" -}}
+{{- if .Values.redisinternal.auth.password -}}
+{{- .Values.redisinternal.auth.password -}}
+{{- else -}}
+{{- $s := lookup "v1" "Secret" .Release.Namespace (printf "%s-redisinternal" .Release.Name) -}}
+{{- if and $s $s.data (index $s.data "redis-password") -}}
+{{- index $s.data "redis-password" | b64dec -}}
+{{- else -}}
+{{- randAlphaNum 24 -}}
+{{- end -}}
+{{- end -}}
+{{- end }}
+
+{{- define "dograh.minioRootPassword" -}}
+{{- if .Values.minio.auth.rootPassword -}}
+{{- .Values.minio.auth.rootPassword -}}
+{{- else -}}
+{{- $s := lookup "v1" "Secret" .Release.Namespace (printf "%s-minio" .Release.Name) -}}
+{{- if and $s $s.data (index $s.data "root-password") -}}
+{{- index $s.data "root-password" | b64dec -}}
+{{- else -}}
+{{- randAlphaNum 24 -}}
+{{- end -}}
+{{- end -}}
+{{- end }}
+
+{{/*
+Default DATABASE_URL when database.mode=internal (any other mode emits the
+literal `$(DATABASE_URL)` — NOTE(review): presumably resolved from an envFrom source; confirm).
+The bundled Postgres (templates/internal-postgres.yaml) stores the app-user password in the <release>-postgresql Secret under key `password`; dograh.dbEnv
+projects it into $(POSTGRES_PASSWORD), which this URL interpolates at runtime.
+Auth username/database default to `dograh` (see values.postgresql.auth).
+*/}}
+{{- define "dograh.databaseUrl" -}}
+{{- if eq .Values.database.mode "internal" -}}
+postgresql+asyncpg://{{ .Values.postgresql.auth.username }}:$(POSTGRES_PASSWORD)@{{ include "dograh.postgresHost" . }}:5432/{{ .Values.postgresql.auth.database }}
+{{- else -}}
+$(DATABASE_URL)
+{{- end -}}
+{{- end }}
+
+{{- define "dograh.redisUrl" -}}
+{{- if eq .Values.redis.mode "internal" -}}
+redis://:$(REDIS_PASSWORD)@{{ include "dograh.redisHost" . }}:6379
+{{- else -}}
+$(REDIS_URL)
+{{- end -}}
+{{- end }}
+
+{{/*
+Database / Redis connection env for backend workloads (web, arq, singletons,
+migrate).
+
+ORDER IS LOAD-BEARING. POSTGRES_PASSWORD / REDIS_PASSWORD are declared BEFORE
+DATABASE_URL / REDIS_URL because Kubernetes only expands a $(VAR) reference to
+an env var defined *earlier* in the same container's env list — a forward
+reference is left as the literal string "$(VAR)". DATABASE_URL / REDIS_URL
+embed $(POSTGRES_PASSWORD) / $(REDIS_PASSWORD) (see dograh.databaseUrl /
+dograh.redisUrl), so the password vars must come first or the composed URLs
+ship with a literal "$(POSTGRES_PASSWORD)" as the password.
+*/}}
+{{- define "dograh.dbEnv" -}}
+{{- if eq .Values.database.mode "internal" }}
+- name: POSTGRES_PASSWORD
+  valueFrom:
+    secretKeyRef:
+      name: {{ .Release.Name }}-postgresql
+      key: password
+{{- end }}
+{{- if eq .Values.redis.mode "internal" }}
+- name: REDIS_PASSWORD
+  valueFrom:
+    secretKeyRef:
+      name: {{ .Release.Name }}-redisinternal
+      key: redis-password
+{{- end }}
+- name: DATABASE_URL
+  value: {{ include "dograh.databaseUrl" . | quote }}
+- name: REDIS_URL
+  value: {{ include "dograh.redisUrl" . | quote }}
+{{- if eq .Values.storage.mode "internalMinio" }}
+{{- /* Internal MinIO creds come from the <release>-minio Secret, the same
+       source the MinIO server uses (no ordering constraint — no composition). */}}
+- name: MINIO_ACCESS_KEY
+  valueFrom:
+    secretKeyRef:
+      name: {{ .Release.Name }}-minio
+      key: root-user
+- name: MINIO_SECRET_KEY
+  valueFrom:
+    secretKeyRef:
+      name: {{ .Release.Name }}-minio
+      key: root-password
+{{- end }}
+{{- end }}
+
+{{/*
+Common env block for backend workloads (web, arq, singletons, migrate).
+References the ConfigMap + Secret via envFrom. DATABASE_URL and REDIS_URL
+are added inline (see dograh.dbEnv) because they may need composition from the bundled deps' Secrets.
+*/}}
+{{- define "dograh.backendEnvFrom" -}}
+- configMapRef:
+    name: {{ include "dograh.configMapName" . }}
+- secretRef:
+    name: {{ include "dograh.secretName" . }}
+{{- end }}
+
+{{/*
+Volume mounts and volumes for the shared-tmp PVC; both helpers below render nothing unless sharedTmp.enabled is set.
+*/}}
+{{- define "dograh.sharedTmpVolumeMounts" -}}
+{{- if .Values.sharedTmp.enabled }}
+- name: shared-tmp
+  mountPath: {{ .Values.sharedTmp.mountPath }}
+{{- end }}
+{{- end }}
+
+{{- define "dograh.sharedTmpVolumes" -}}
+{{- if .Values.sharedTmp.enabled }}
+- name: shared-tmp
+  persistentVolumeClaim:
+    claimName: {{ include "dograh.fullname" . }}-shared-tmp
+{{- end }}
+{{- end }}
diff --git a/deploy/helm/dograh/templates/ari-manager-deployment.yaml b/deploy/helm/dograh/templates/ari-manager-deployment.yaml
new file mode 100644
index 00000000..9bc83252
--- /dev/null
+++ b/deploy/helm/dograh/templates/ari-manager-deployment.yaml
@@ -0,0 +1,64 @@
+{{- if .Values.ariManager.enabled }}
+# SINGLETON — replicas hard-coded to 1, strategy: Recreate.
+# ari-manager maintains an outbound WebSocket to Asterisk and is the
+# canonical receiver of ARI events. Running >1 replica produces duplicate
+# event handling. There is NO replica knob on purpose. Add leader
+# election before relaxing this constraint.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ include "dograh.ariManager.fullname" . }}
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "dograh.labels" . | nindent 4 }}
+    app.kubernetes.io/component: ari-manager
+spec:
+  replicas: 1
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      {{- include "dograh.selectorLabels" . | nindent 6 }}
+      app.kubernetes.io/component: ari-manager
+  template:
+    metadata:
+      labels:
+        {{- include "dograh.selectorLabels" .
| nindent 8 }} + app.kubernetes.io/component: ari-manager + {{- with .Values.ariManager.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + serviceAccountName: {{ include "dograh.serviceAccountName" . }} + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + containers: + - name: ari-manager + image: {{ include "dograh.image" . }} + imagePullPolicy: {{ .Values.image.pullPolicy }} + command: ["./scripts/run_ari_manager.sh"] + envFrom: + {{- include "dograh.backendEnvFrom" . | nindent 12 }} + env: + {{- include "dograh.dbEnv" . | nindent 12 }} + # exec probe — no HTTP endpoint exists on ari-manager. + livenessProbe: + {{- toYaml .Values.ariManager.livenessProbe | nindent 12 }} + resources: + {{- toYaml .Values.ariManager.resources | nindent 12 }} + {{- with .Values.ariManager.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.ariManager.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.ariManager.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} +{{- end }} diff --git a/deploy/helm/dograh/templates/arq-worker-deployment.yaml b/deploy/helm/dograh/templates/arq-worker-deployment.yaml new file mode 100644 index 00000000..4af2fc5a --- /dev/null +++ b/deploy/helm/dograh/templates/arq-worker-deployment.yaml @@ -0,0 +1,63 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "dograh.arqWorker.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "dograh.labels" . | nindent 4 }} + app.kubernetes.io/component: arq-worker +spec: + replicas: {{ .Values.workers.replicaCount }} + selector: + matchLabels: + {{- include "dograh.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: arq-worker + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + template: + metadata: + labels: + {{- include "dograh.selectorLabels" . 
| nindent 8 }} + app.kubernetes.io/component: arq-worker + {{- with .Values.workers.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + serviceAccountName: {{ include "dograh.serviceAccountName" . }} + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + containers: + - name: arq-worker + image: {{ include "dograh.image" . }} + imagePullPolicy: {{ .Values.image.pullPolicy }} + command: ["./scripts/run_arq_worker.sh"] + envFrom: + {{- include "dograh.backendEnvFrom" . | nindent 12 }} + env: + {{- include "dograh.dbEnv" . | nindent 12 }} + livenessProbe: + {{- toYaml .Values.workers.livenessProbe | nindent 12 }} + resources: + {{- toYaml .Values.workers.resources | nindent 12 }} + volumeMounts: + {{- include "dograh.sharedTmpVolumeMounts" . | nindent 12 }} + volumes: + {{- include "dograh.sharedTmpVolumes" . | nindent 8 }} + {{- with .Values.workers.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.workers.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.workers.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} diff --git a/deploy/helm/dograh/templates/campaign-orchestrator-deployment.yaml b/deploy/helm/dograh/templates/campaign-orchestrator-deployment.yaml new file mode 100644 index 00000000..8e1ef2c9 --- /dev/null +++ b/deploy/helm/dograh/templates/campaign-orchestrator-deployment.yaml @@ -0,0 +1,64 @@ +{{- if .Values.campaignOrchestrator.enabled }} +# SINGLETON — replicas hard-coded to 1, strategy: Recreate. +# campaign_orchestrator uses in-memory deduplication locks +# (`_processing_locks`); running >1 replica would silently break +# scheduling. Same singleton rules as ari-manager: no replica knob, +# Recreate strategy. +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "dograh.campaignOrchestrator.fullname" . 
}} + namespace: {{ .Release.Namespace }} + labels: + {{- include "dograh.labels" . | nindent 4 }} + app.kubernetes.io/component: campaign-orchestrator +spec: + replicas: 1 + strategy: + type: Recreate + selector: + matchLabels: + {{- include "dograh.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: campaign-orchestrator + template: + metadata: + labels: + {{- include "dograh.selectorLabels" . | nindent 8 }} + app.kubernetes.io/component: campaign-orchestrator + {{- with .Values.campaignOrchestrator.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + serviceAccountName: {{ include "dograh.serviceAccountName" . }} + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + containers: + - name: campaign-orchestrator + image: {{ include "dograh.image" . }} + imagePullPolicy: {{ .Values.image.pullPolicy }} + command: ["./scripts/run_campaign_orchestrator.sh"] + envFrom: + {{- include "dograh.backendEnvFrom" . | nindent 12 }} + env: + {{- include "dograh.dbEnv" . | nindent 12 }} + # exec probe — no HTTP endpoint exists on campaign-orchestrator. + livenessProbe: + {{- toYaml .Values.campaignOrchestrator.livenessProbe | nindent 12 }} + resources: + {{- toYaml .Values.campaignOrchestrator.resources | nindent 12 }} + {{- with .Values.campaignOrchestrator.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.campaignOrchestrator.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.campaignOrchestrator.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} +{{- end }} diff --git a/deploy/helm/dograh/templates/configmap.yaml b/deploy/helm/dograh/templates/configmap.yaml new file mode 100644 index 00000000..6502d61b --- /dev/null +++ b/deploy/helm/dograh/templates/configmap.yaml @@ -0,0 +1,38 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "dograh.configMapName" . 
}} + namespace: {{ .Release.Namespace }} + labels: + {{- include "dograh.labels" . | nindent 4 }} +data: + ENVIRONMENT: {{ .Values.config.environment | quote }} + LOG_LEVEL: {{ .Values.config.logLevel | quote }} + BACKEND_API_ENDPOINT: {{ .Values.config.backendApiEndpoint | quote }} + MINIO_BUCKET: {{ .Values.config.minioBucket | quote }} + MINIO_SECURE: {{ .Values.config.minioSecure | quote }} + ENABLE_AWS_S3: {{ ternary "true" "false" (eq .Values.storage.mode "s3") | quote }} + ENABLE_TELEMETRY: {{ .Values.config.enableTelemetry | quote }} + POSTHOG_HOST: {{ .Values.config.posthogHost | quote }} + POSTHOG_API_KEY: {{ .Values.config.posthogApiKey | quote }} + FORCE_TURN_RELAY: {{ .Values.config.forceTurnRelay | quote }} + TURN_HOST: {{ .Values.config.turnHost | quote }} + FASTAPI_WORKERS: {{ .Values.config.fastapiWorkers | quote }} + {{- /* MinIO endpoints derived from storage mode. */ -}} + {{- if eq .Values.storage.mode "internalMinio" }} + MINIO_ENDPOINT: {{ printf "%s:9000" (include "dograh.minioHost" .) | quote }} + MINIO_PUBLIC_ENDPOINT: {{ .Values.config.minioPublicEndpoint | quote }} + {{- else if eq .Values.storage.mode "externalMinio" }} + MINIO_ENDPOINT: {{ .Values.storage.externalMinio.endpoint | quote }} + MINIO_PUBLIC_ENDPOINT: {{ .Values.storage.externalMinio.publicEndpoint | quote }} + {{- else if eq .Values.storage.mode "s3" }} + AWS_REGION: {{ .Values.storage.s3.region | quote }} + MINIO_PUBLIC_ENDPOINT: {{ .Values.storage.s3.publicEndpoint | quote }} + {{- end }} + {{- /* TURN external IP visible to the web service for credential issuance. 
*/ -}}
+  {{- if .Values.coturn.enabled }}
+  TURN_EXTERNAL_IP: {{ .Values.coturn.externalIp | quote }}
+  {{- end }}
+  {{- if .Values.secrets.langfuseHost }}
+  LANGFUSE_HOST: {{ .Values.secrets.langfuseHost | quote }}
+  {{- end }}
diff --git a/deploy/helm/dograh/templates/coturn-configmap.yaml b/deploy/helm/dograh/templates/coturn-configmap.yaml
new file mode 100644
index 00000000..9148d4a2
--- /dev/null
+++ b/deploy/helm/dograh/templates/coturn-configmap.yaml
@@ -0,0 +1,37 @@
+{{- if .Values.coturn.enabled }}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ include "dograh.coturn.fullname" . }}
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "dograh.labels" . | nindent 4 }}
+    app.kubernetes.io/component: coturn
+data:
+  turnserver.conf: |
+    # Auto-generated by the Dograh Helm chart. ${TURN_SECRET} is filled in by envsubst at container start.
+    listening-port={{ .Values.coturn.ports.plain }}
+    tls-listening-port={{ .Values.coturn.ports.tls }}
+
+    min-port={{ .Values.coturn.relayPortRange.min }}
+    max-port={{ .Values.coturn.relayPortRange.max }}
+
+    {{- if .Values.coturn.externalIp }}
+    external-ip={{ .Values.coturn.externalIp }}
+    {{- else }}
+    # external-ip not yet set. Run:
+    #   helm upgrade {{ .Release.Name }} . --reuse-values --set coturn.externalIp=<EXTERNAL_IP>
+    # once the LoadBalancer Service has an address.
+    {{- end }}
+
+    realm={{ .Values.coturn.realm }}
+
+    use-auth-secret
+    static-auth-secret=${TURN_SECRET}
+
+    fingerprint
+    no-cli
+    no-multicast-peers
+
+    log-file=stdout
+{{- end }}
diff --git a/deploy/helm/dograh/templates/coturn-deployment.yaml b/deploy/helm/dograh/templates/coturn-deployment.yaml
new file mode 100644
index 00000000..16d958c2
--- /dev/null
+++ b/deploy/helm/dograh/templates/coturn-deployment.yaml
@@ -0,0 +1,98 @@
+{{- if .Values.coturn.enabled }}
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ include "dograh.coturn.fullname" . }}
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "dograh.labels" .
| nindent 4 }}
+    app.kubernetes.io/component: coturn
+spec:
+  # coturn is a singleton (per-LB instance). HA TURN requires a separate
+  # design (multiple LBs or anycast).
+  replicas: 1
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      {{- include "dograh.selectorLabels" . | nindent 6 }}
+      app.kubernetes.io/component: coturn
+  template:
+    metadata:
+      labels:
+        {{- include "dograh.selectorLabels" . | nindent 8 }}
+        app.kubernetes.io/component: coturn
+      annotations:
+        # Re-roll coturn when turnserver.conf changes.
+        checksum/config: {{ include (print $.Template.BasePath "/coturn-configmap.yaml") . | sha256sum }}
+        # coturn.podAnnotations are appended to this same map; a second `annotations:` key would be a duplicate YAML key.
+        {{- with .Values.coturn.podAnnotations }}
+        {{- toYaml . | nindent 8 }}
+        {{- end }}
+    spec:
+      serviceAccountName: {{ include "dograh.serviceAccountName" . }}
+      {{- with .Values.imagePullSecrets }}
+      imagePullSecrets:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      containers:
+        - name: coturn
+          image: {{ include "dograh.coturn.image" . }}
+          imagePullPolicy: {{ .Values.coturn.image.pullPolicy }}
+          # coturn doesn't expand env vars in its config file; we render
+          # via envsubst at start so static-auth-secret can come from a
+          # Kubernetes Secret instead of being baked into the ConfigMap.
+          command:
+            - sh
+            - -c
+            - |
+              set -e
+              apk add --no-cache gettext >/dev/null 2>&1 || true
+              envsubst < /etc/coturn-template/turnserver.conf > /tmp/turnserver.conf
+              exec turnserver -c /tmp/turnserver.conf
+          env:
+            - name: TURN_SECRET
+              valueFrom:
+                secretKeyRef:
+                  name: {{ include "dograh.secretName" .
}} + key: TURN_SECRET + ports: + - name: turn-udp + containerPort: {{ .Values.coturn.ports.plain }} + protocol: UDP + - name: turn-tcp + containerPort: {{ .Values.coturn.ports.plain }} + protocol: TCP + - name: turns-udp + containerPort: {{ .Values.coturn.ports.tls }} + protocol: UDP + - name: turns-tcp + containerPort: {{ .Values.coturn.ports.tls }} + protocol: TCP + resources: + {{- toYaml .Values.coturn.resources | nindent 12 }} + volumeMounts: + - name: config + mountPath: /etc/coturn-template + readOnly: true + - name: tmp + mountPath: /tmp + volumes: + - name: config + configMap: + name: {{ include "dograh.coturn.fullname" . }} + - name: tmp + emptyDir: {} + {{- with .Values.coturn.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.coturn.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.coturn.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} +{{- end }} diff --git a/deploy/helm/dograh/templates/coturn-service.yaml b/deploy/helm/dograh/templates/coturn-service.yaml new file mode 100644 index 00000000..a5034638 --- /dev/null +++ b/deploy/helm/dograh/templates/coturn-service.yaml @@ -0,0 +1,47 @@ +{{- if .Values.coturn.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "dograh.coturn.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "dograh.labels" . | nindent 4 }} + app.kubernetes.io/component: coturn + {{- with .Values.coturn.service.annotations }} + annotations: + {{- toYaml . 
| nindent 4 }} + {{- end }} +spec: + type: {{ .Values.coturn.service.type }} + {{- if eq .Values.coturn.service.type "LoadBalancer" }} + externalTrafficPolicy: {{ .Values.coturn.service.externalTrafficPolicy }} + {{- end }} + ports: + - name: turn-udp + port: {{ .Values.coturn.ports.plain }} + targetPort: turn-udp + protocol: UDP + - name: turn-tcp + port: {{ .Values.coturn.ports.plain }} + targetPort: turn-tcp + protocol: TCP + - name: turns-udp + port: {{ .Values.coturn.ports.tls }} + targetPort: turns-udp + protocol: UDP + - name: turns-tcp + port: {{ .Values.coturn.ports.tls }} + targetPort: turns-tcp + protocol: TCP + # Relay range ports. AWS NLB has a default 50-listener cap; widening + # the range past that requires either a quota bump or multiple NLBs. + {{- range $port := untilStep (int .Values.coturn.relayPortRange.min) (int (add1 (int .Values.coturn.relayPortRange.max))) 1 }} + - name: relay-{{ $port }} + port: {{ $port }} + targetPort: {{ $port }} + protocol: UDP + {{- end }} + selector: + {{- include "dograh.selectorLabels" . | nindent 4 }} + app.kubernetes.io/component: coturn +{{- end }} diff --git a/deploy/helm/dograh/templates/gateway.yaml b/deploy/helm/dograh/templates/gateway.yaml new file mode 100644 index 00000000..24a6b622 --- /dev/null +++ b/deploy/helm/dograh/templates/gateway.yaml @@ -0,0 +1,34 @@ +{{- if and (eq .Values.exposure.mode "gatewayApi") .Values.exposure.gatewayApi.createGateway }} +apiVersion: gateway.networking.k8s.io/v1 +kind: Gateway +metadata: + name: {{ include "dograh.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "dograh.labels" . 
| nindent 4 }} +spec: + gatewayClassName: {{ required "exposure.gatewayApi.gatewayClassName is required when createGateway=true" .Values.exposure.gatewayApi.gatewayClassName }} + listeners: + - name: http + port: 80 + protocol: HTTP + {{- if .Values.exposure.gatewayApi.listenerHostname }} + hostname: {{ .Values.exposure.gatewayApi.listenerHostname | quote }} + {{- end }} + allowedRoutes: + namespaces: + from: Same + - name: https + port: 443 + protocol: HTTPS + {{- if .Values.exposure.gatewayApi.listenerHostname }} + hostname: {{ .Values.exposure.gatewayApi.listenerHostname | quote }} + {{- end }} + tls: + mode: Terminate + certificateRefs: + - name: {{ default (printf "%s-tls" (include "dograh.fullname" .)) .Values.exposure.ingress.tls.secretName }} + allowedRoutes: + namespaces: + from: Same +{{- end }} diff --git a/deploy/helm/dograh/templates/httproute-api.yaml b/deploy/helm/dograh/templates/httproute-api.yaml new file mode 100644 index 00000000..8e8a2922 --- /dev/null +++ b/deploy/helm/dograh/templates/httproute-api.yaml @@ -0,0 +1,33 @@ +{{- if eq .Values.exposure.mode "gatewayApi" }} +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: {{ include "dograh.fullname" . }}-api + namespace: {{ .Release.Namespace }} + labels: + {{- include "dograh.labels" . | nindent 4 }} +spec: + parentRefs: + {{- if .Values.exposure.gatewayApi.createGateway }} + - name: {{ include "dograh.fullname" . }} + {{- else }} + {{- range .Values.exposure.gatewayApi.parentRefs }} + - name: {{ .name }} + {{- if .namespace }} + namespace: {{ .namespace }} + {{- end }} + {{- end }} + {{- end }} + {{- if .Values.exposure.gatewayApi.listenerHostname }} + hostnames: + - {{ .Values.exposure.gatewayApi.listenerHostname | quote }} + {{- end }} + rules: + - matches: + - path: + type: PathPrefix + value: /api/ + backendRefs: + - name: {{ include "dograh.web.fullname" . 
}} + port: {{ .Values.web.service.port }} +{{- end }} diff --git a/deploy/helm/dograh/templates/httproute-minio.yaml b/deploy/helm/dograh/templates/httproute-minio.yaml new file mode 100644 index 00000000..5630fe2b --- /dev/null +++ b/deploy/helm/dograh/templates/httproute-minio.yaml @@ -0,0 +1,46 @@ +{{- /* +Browser-visible MinIO route. Mounted under the shared API/UI hostname at +/voice-audio/* to mirror the existing nginx pass-through. Operators who +want a dedicated hostname can edit this template. +*/ -}} +{{- if and (eq .Values.exposure.mode "gatewayApi") (or (eq .Values.storage.mode "internalMinio") (eq .Values.storage.mode "externalMinio")) }} +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: {{ include "dograh.fullname" . }}-minio + namespace: {{ .Release.Namespace }} + labels: + {{- include "dograh.labels" . | nindent 4 }} +spec: + parentRefs: + {{- if .Values.exposure.gatewayApi.createGateway }} + - name: {{ include "dograh.fullname" . }} + {{- else }} + {{- range .Values.exposure.gatewayApi.parentRefs }} + - name: {{ .name }} + {{- if .namespace }} + namespace: {{ .namespace }} + {{- end }} + {{- end }} + {{- end }} + {{- if .Values.exposure.gatewayApi.listenerHostname }} + hostnames: + - {{ .Values.exposure.gatewayApi.listenerHostname | quote }} + {{- end }} + rules: + - matches: + - path: + type: PathPrefix + value: /voice-audio/ + backendRefs: + {{- if eq .Values.storage.mode "internalMinio" }} + - name: {{ include "dograh.minioHost" . }} + port: 9000 + {{- else }} + # externalMinio: route to a placeholder Service named + # -minio-external. Operators must create this Service of + # type ExternalName pointing at storage.externalMinio.endpoint. + - name: {{ include "dograh.fullname" . 
}}-minio-external + port: 9000 + {{- end }} +{{- end }} diff --git a/deploy/helm/dograh/templates/httproute-ui.yaml b/deploy/helm/dograh/templates/httproute-ui.yaml new file mode 100644 index 00000000..23b170df --- /dev/null +++ b/deploy/helm/dograh/templates/httproute-ui.yaml @@ -0,0 +1,33 @@ +{{- if and (eq .Values.exposure.mode "gatewayApi") .Values.ui.enabled }} +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: {{ include "dograh.fullname" . }}-ui + namespace: {{ .Release.Namespace }} + labels: + {{- include "dograh.labels" . | nindent 4 }} +spec: + parentRefs: + {{- if .Values.exposure.gatewayApi.createGateway }} + - name: {{ include "dograh.fullname" . }} + {{- else }} + {{- range .Values.exposure.gatewayApi.parentRefs }} + - name: {{ .name }} + {{- if .namespace }} + namespace: {{ .namespace }} + {{- end }} + {{- end }} + {{- end }} + {{- if .Values.exposure.gatewayApi.listenerHostname }} + hostnames: + - {{ .Values.exposure.gatewayApi.listenerHostname | quote }} + {{- end }} + rules: + - matches: + - path: + type: PathPrefix + value: / + backendRefs: + - name: {{ include "dograh.ui.fullname" . }} + port: {{ .Values.ui.service.port }} +{{- end }} diff --git a/deploy/helm/dograh/templates/ingress.yaml b/deploy/helm/dograh/templates/ingress.yaml new file mode 100644 index 00000000..e06850cf --- /dev/null +++ b/deploy/helm/dograh/templates/ingress.yaml @@ -0,0 +1,66 @@ +{{- if eq .Values.exposure.mode "ingress" }} +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: {{ include "dograh.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "dograh.labels" . | nindent 4 }} + annotations: + # Sensible defaults for WebSocket-heavy signaling traffic. These are + # nginx-ingress style; if you use a different controller, override + # via exposure.ingress.annotations. 
+    {{- /*
+    Defaults and exposure.ingress.annotations are merged into ONE map (user
+    values win), so overriding a default never emits a duplicate YAML key.
+    */ -}}
+    {{- $defaults := dict "nginx.ingress.kubernetes.io/proxy-read-timeout" "3600" "nginx.ingress.kubernetes.io/proxy-send-timeout" "3600" "nginx.ingress.kubernetes.io/proxy-body-size" "100m" }}
+    {{- toYaml (mergeOverwrite $defaults (.Values.exposure.ingress.annotations | default (dict))) | nindent 4 }}
+spec:
+  {{- if .Values.exposure.ingress.className }}
+  ingressClassName: {{ .Values.exposure.ingress.className | quote }}
+  {{- end }}
+  {{- if .Values.exposure.ingress.tls.enabled }}
+  tls:
+    - hosts:
+        - {{ required "exposure.ingress.host is required when tls.enabled=true" .Values.exposure.ingress.host | quote }}
+      secretName: {{ required "exposure.ingress.tls.secretName is required when tls.enabled=true" .Values.exposure.ingress.tls.secretName | quote }}
+  {{- end }}
+  rules:
+    - {{- if .Values.exposure.ingress.host }}
+      host: {{ .Values.exposure.ingress.host | quote }}
+      {{- end }}
+      http:
+        paths:
+          - path: /api/
+            pathType: Prefix
+            backend:
+              service:
+                name: {{ include "dograh.web.fullname" . }}
+                port:
+                  number: {{ .Values.web.service.port }}
+          {{- if or (eq .Values.storage.mode "internalMinio") (eq .Values.storage.mode "externalMinio") }}
+          - path: /voice-audio/
+            pathType: Prefix
+            backend:
+              service:
+                {{- if eq .Values.storage.mode "internalMinio" }}
+                name: {{ include "dograh.minioHost" . }}
+                {{- else }}
+                # externalMinio: requires an ExternalName Service named
+                # <fullname>-minio-external pointing at storage.externalMinio.endpoint.
+                name: {{ include "dograh.fullname" . }}-minio-external
+                {{- end }}
+                port:
+                  number: 9000
+          {{- end }}
+          {{- if .Values.ui.enabled }}
+          - path: /
+            pathType: Prefix
+            backend:
+              service:
+                name: {{ include "dograh.ui.fullname" .
}} + port: + number: {{ .Values.ui.service.port }} + {{- end }} +{{- end }} diff --git a/deploy/helm/dograh/templates/internal-minio.yaml b/deploy/helm/dograh/templates/internal-minio.yaml new file mode 100644 index 00000000..f3760462 --- /dev/null +++ b/deploy/helm/dograh/templates/internal-minio.yaml @@ -0,0 +1,137 @@ +{{- if eq .Values.storage.mode "internalMinio" }} +{{- /* +Bundled MinIO for storage.mode=internalMinio, official upstream image. Single +replica (Recreate strategy — the RWO volume can't be shared). The app creates +its bucket on first use (bucket_exists → make_bucket), so no provisioning Job. + +The root credentials live in -minio and are consumed BOTH by the MinIO +server (MINIO_ROOT_USER / MINIO_ROOT_PASSWORD) and by the app (MINIO_ACCESS_KEY / +MINIO_SECRET_KEY, wired in dograh.dbEnv) — one source of truth, so they can't +drift. For production use storage.mode=s3 or externalMinio. +*/}} +apiVersion: v1 +kind: Secret +metadata: + name: {{ .Release.Name }}-minio + namespace: {{ .Release.Namespace }} + labels: + {{- include "dograh.labels" . | nindent 4 }} + app.kubernetes.io/component: minio +type: Opaque +stringData: + root-user: {{ .Values.minio.auth.rootUser | default "minioadmin" | quote }} + root-password: {{ include "dograh.minioRootPassword" . | quote }} +--- +apiVersion: v1 +kind: Service +metadata: + name: {{ .Release.Name }}-minio + namespace: {{ .Release.Namespace }} + labels: + {{- include "dograh.labels" . | nindent 4 }} + app.kubernetes.io/component: minio +spec: + type: ClusterIP + ports: + - name: api + port: 9000 + targetPort: api + - name: console + port: 9001 + targetPort: console + selector: + {{- include "dograh.selectorLabels" . | nindent 4 }} + app.kubernetes.io/component: minio +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ .Release.Name }}-minio + namespace: {{ .Release.Namespace }} + labels: + {{- include "dograh.labels" . 
| nindent 4 }} + app.kubernetes.io/component: minio +spec: + accessModes: ["ReadWriteOnce"] + {{- if .Values.minio.persistence.storageClass }} + storageClassName: {{ .Values.minio.persistence.storageClass | quote }} + {{- end }} + resources: + requests: + storage: {{ .Values.minio.persistence.size | quote }} +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ .Release.Name }}-minio + namespace: {{ .Release.Namespace }} + labels: + {{- include "dograh.labels" . | nindent 4 }} + app.kubernetes.io/component: minio +spec: + replicas: 1 + strategy: + type: Recreate + selector: + matchLabels: + {{- include "dograh.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: minio + template: + metadata: + labels: + {{- include "dograh.selectorLabels" . | nindent 8 }} + app.kubernetes.io/component: minio + spec: + securityContext: + fsGroup: 1000 + containers: + - name: minio + image: {{ printf "%s/%s:%s" (.Values.minio.image.registry | default "docker.io") .Values.minio.image.repository (.Values.minio.image.tag | toString) | quote }} + imagePullPolicy: {{ .Values.minio.image.pullPolicy | default "IfNotPresent" }} + args: + - server + - /data + - --console-address + - ":9001" + env: + - name: MINIO_ROOT_USER + valueFrom: + secretKeyRef: + name: {{ .Release.Name }}-minio + key: root-user + - name: MINIO_ROOT_PASSWORD + valueFrom: + secretKeyRef: + name: {{ .Release.Name }}-minio + key: root-password + ports: + - name: api + containerPort: 9000 + - name: console + containerPort: 9001 + readinessProbe: + httpGet: + path: /minio/health/ready + port: api + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 6 + livenessProbe: + httpGet: + path: /minio/health/live + port: api + initialDelaySeconds: 20 + periodSeconds: 15 + timeoutSeconds: 5 + failureThreshold: 6 + resources: + {{- toYaml .Values.minio.resources | nindent 12 }} + volumeMounts: + - name: data + mountPath: /data + volumes: + - name: data + persistentVolumeClaim: + 
claimName: {{ .Release.Name }}-minio +{{- end }} diff --git a/deploy/helm/dograh/templates/internal-postgres.yaml b/deploy/helm/dograh/templates/internal-postgres.yaml new file mode 100644 index 00000000..46a83ecf --- /dev/null +++ b/deploy/helm/dograh/templates/internal-postgres.yaml @@ -0,0 +1,118 @@ +{{- if eq .Values.database.mode "internal" }} +{{- /* +Bundled PostgreSQL for database.mode=internal. + +Image is the official pgvector build — upstream postgres plus the `vector` +extension the app's migrations require (CREATE EXTENSION vector). POSTGRES_USER +is the initdb superuser, so the migration can create the (untrusted) extension. + +This is the convenience/all-in-one path. For production, set database.mode=external +and point secrets.databaseUrl at a managed Postgres. +*/}} +apiVersion: v1 +kind: Secret +metadata: + name: {{ .Release.Name }}-postgresql + namespace: {{ .Release.Namespace }} + labels: + {{- include "dograh.labels" . | nindent 4 }} + app.kubernetes.io/component: postgresql +type: Opaque +stringData: + password: {{ include "dograh.postgresPassword" . | quote }} +--- +apiVersion: v1 +kind: Service +metadata: + name: {{ .Release.Name }}-postgresql + namespace: {{ .Release.Namespace }} + labels: + {{- include "dograh.labels" . | nindent 4 }} + app.kubernetes.io/component: postgresql +spec: + type: ClusterIP + ports: + - name: postgresql + port: 5432 + targetPort: postgresql + selector: + {{- include "dograh.selectorLabels" . | nindent 4 }} + app.kubernetes.io/component: postgresql +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: {{ .Release.Name }}-postgresql + namespace: {{ .Release.Namespace }} + labels: + {{- include "dograh.labels" . | nindent 4 }} + app.kubernetes.io/component: postgresql +spec: + serviceName: {{ .Release.Name }}-postgresql + replicas: 1 + selector: + matchLabels: + {{- include "dograh.selectorLabels" . 
| nindent 6 }} + app.kubernetes.io/component: postgresql + template: + metadata: + labels: + {{- include "dograh.selectorLabels" . | nindent 8 }} + app.kubernetes.io/component: postgresql + spec: + securityContext: + # Official image starts as root, chowns PGDATA, then drops to the + # `postgres` uid (999). fsGroup makes the mounted volume group-writable. + fsGroup: 999 + containers: + - name: postgresql + image: {{ printf "%s/%s:%s" (.Values.postgresql.image.registry | default "docker.io") .Values.postgresql.image.repository (.Values.postgresql.image.tag | toString) | quote }} + imagePullPolicy: {{ .Values.postgresql.image.pullPolicy | default "IfNotPresent" }} + ports: + - name: postgresql + containerPort: 5432 + env: + - name: POSTGRES_USER + value: {{ .Values.postgresql.auth.username | quote }} + - name: POSTGRES_DB + value: {{ .Values.postgresql.auth.database | quote }} + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: {{ .Release.Name }}-postgresql + key: password + # Init into a subdirectory so a mounted volume's lost+found does not + # collide with initdb's empty-dir check. 
+ - name: PGDATA + value: /var/lib/postgresql/data/pgdata + readinessProbe: + exec: + command: ["pg_isready", "-U", {{ .Values.postgresql.auth.username | quote }}, "-d", {{ .Values.postgresql.auth.database | quote }}] + initialDelaySeconds: 10 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 6 + livenessProbe: + exec: + command: ["pg_isready", "-U", {{ .Values.postgresql.auth.username | quote }}, "-d", {{ .Values.postgresql.auth.database | quote }}] + initialDelaySeconds: 30 + periodSeconds: 15 + timeoutSeconds: 5 + failureThreshold: 6 + resources: + {{- toYaml .Values.postgresql.resources | nindent 12 }} + volumeMounts: + - name: data + mountPath: /var/lib/postgresql/data + volumeClaimTemplates: + - metadata: + name: data + spec: + accessModes: ["ReadWriteOnce"] + {{- if .Values.postgresql.persistence.storageClass }} + storageClassName: {{ .Values.postgresql.persistence.storageClass | quote }} + {{- end }} + resources: + requests: + storage: {{ .Values.postgresql.persistence.size | quote }} +{{- end }} diff --git a/deploy/helm/dograh/templates/internal-redis.yaml b/deploy/helm/dograh/templates/internal-redis.yaml new file mode 100644 index 00000000..c3f4caa5 --- /dev/null +++ b/deploy/helm/dograh/templates/internal-redis.yaml @@ -0,0 +1,115 @@ +{{- if eq .Values.redis.mode "internal" }} +{{- /* +Bundled Redis for redis.mode=internal, official upstream image. Single-node, +password-protected, AOF persistence. For production set redis.mode=external and +point secrets.redisUrl at a managed Redis/Valkey. + +Service name is <release>-redisinternal-master (kept from the previous packaging) +so dograh.redisHost / dograh.redisUrl need no change. +*/}} +apiVersion: v1 +kind: Secret +metadata: + name: {{ .Release.Name }}-redisinternal + namespace: {{ .Release.Namespace }} + labels: + {{- include "dograh.labels" . | nindent 4 }} + app.kubernetes.io/component: redis +type: Opaque +stringData: + redis-password: {{ include "dograh.redisPassword" . 
| quote }} +--- +apiVersion: v1 +kind: Service +metadata: + name: {{ .Release.Name }}-redisinternal-master + namespace: {{ .Release.Namespace }} + labels: + {{- include "dograh.labels" . | nindent 4 }} + app.kubernetes.io/component: redis +spec: + type: ClusterIP + ports: + - name: redis + port: 6379 + targetPort: redis + selector: + {{- include "dograh.selectorLabels" . | nindent 4 }} + app.kubernetes.io/component: redis +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: {{ .Release.Name }}-redisinternal-master + namespace: {{ .Release.Namespace }} + labels: + {{- include "dograh.labels" . | nindent 4 }} + app.kubernetes.io/component: redis +spec: + serviceName: {{ .Release.Name }}-redisinternal-master + replicas: 1 + selector: + matchLabels: + {{- include "dograh.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: redis + template: + metadata: + labels: + {{- include "dograh.selectorLabels" . | nindent 8 }} + app.kubernetes.io/component: redis + spec: + securityContext: + runAsUser: 999 + runAsGroup: 999 + fsGroup: 999 + containers: + - name: redis + image: {{ printf "%s/%s:%s" (.Values.redisinternal.image.registry | default "docker.io") .Values.redisinternal.image.repository (.Values.redisinternal.image.tag | toString) | quote }} + imagePullPolicy: {{ .Values.redisinternal.image.pullPolicy | default "IfNotPresent" }} + command: ["redis-server"] + # $(REDIS_PASSWORD) is expanded by Kubernetes from the env var below. 
+ args: + - "--requirepass" + - "$(REDIS_PASSWORD)" + - "--appendonly" + - "yes" + env: + - name: REDIS_PASSWORD + valueFrom: + secretKeyRef: + name: {{ .Release.Name }}-redisinternal + key: redis-password + ports: + - name: redis + containerPort: 6379 + readinessProbe: + exec: + command: ["sh", "-c", 'redis-cli -a "$REDIS_PASSWORD" ping | grep -q PONG'] + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 6 + livenessProbe: + exec: + command: ["sh", "-c", 'redis-cli -a "$REDIS_PASSWORD" ping | grep -q PONG'] + initialDelaySeconds: 20 + periodSeconds: 15 + timeoutSeconds: 5 + failureThreshold: 6 + resources: + {{- toYaml .Values.redisinternal.resources | nindent 12 }} + volumeMounts: + - name: data + mountPath: /data + volumeClaimTemplates: + - metadata: + name: data + spec: + accessModes: ["ReadWriteOnce"] + {{- if .Values.redisinternal.persistence.storageClass }} + storageClassName: {{ .Values.redisinternal.persistence.storageClass | quote }} + {{- end }} + resources: + requests: + storage: {{ .Values.redisinternal.persistence.size | quote }} +{{- end }} diff --git a/deploy/helm/dograh/templates/migrate-job.yaml b/deploy/helm/dograh/templates/migrate-job.yaml new file mode 100644 index 00000000..5183a2b8 --- /dev/null +++ b/deploy/helm/dograh/templates/migrate-job.yaml @@ -0,0 +1,68 @@ +{{- if .Values.migrate.enabled }} +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ include "dograh.migrate.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "dograh.labels" . | nindent 4 }} + app.kubernetes.io/component: migrate + annotations: + # post-install (not pre-install): when the bundled Postgres manifest is in + # use it is a normal resource, so on a fresh install it does not exist until + # after pre-install hooks finish — a pre-install migration would deadlock + # waiting for a database Helm has not created yet. post-install runs after + # all resources (incl. Postgres) are applied. 
pre-upgrade is kept so + # migrations land before new app code rolls out on upgrades (where the DB + # already exists). + "helm.sh/hook": post-install,pre-upgrade + "helm.sh/hook-weight": "-5" + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +spec: + backoffLimit: 0 + activeDeadlineSeconds: {{ .Values.migrate.activeDeadlineSeconds }} + ttlSecondsAfterFinished: 86400 + template: + metadata: + labels: + {{- include "dograh.selectorLabels" . | nindent 8 }} + app.kubernetes.io/component: migrate + spec: + serviceAccountName: {{ include "dograh.serviceAccountName" . }} + restartPolicy: Never + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- if eq .Values.database.mode "internal" }} + # post-install fires right after resources are applied — the bundled + # Postgres pod may still be starting. Block until its Service routes to a + # ready endpoint (endpoints only publish once the pod passes readiness) + # before alembic attempts to connect. Reuses the app image (already + # pulled for the migrate container) so no extra image is needed. + initContainers: + - name: wait-for-postgres + image: {{ include "dograh.image" . }} + imagePullPolicy: {{ .Values.image.pullPolicy }} + command: + - bash + - -c + - | + until (exec 3<>/dev/tcp/{{ include "dograh.postgresHost" . }}/5432) 2>/dev/null; do + echo "waiting for postgres at {{ include "dograh.postgresHost" . }}:5432 ..." + sleep 2 + done + echo "postgres is reachable" + {{- end }} + containers: + - name: migrate + image: {{ include "dograh.image" . }} + imagePullPolicy: {{ .Values.image.pullPolicy }} + command: ["./scripts/run_migrate.sh"] + envFrom: + {{- include "dograh.backendEnvFrom" . | nindent 12 }} + env: + {{- include "dograh.dbEnv" . 
| nindent 12 }} + resources: + {{- toYaml .Values.migrate.resources | nindent 12 }} +{{- end }} diff --git a/deploy/helm/dograh/templates/secret.yaml b/deploy/helm/dograh/templates/secret.yaml new file mode 100644 index 00000000..01bda8a1 --- /dev/null +++ b/deploy/helm/dograh/templates/secret.yaml @@ -0,0 +1,35 @@ +{{- if not .Values.secrets.existingSecret }} +apiVersion: v1 +kind: Secret +metadata: + name: {{ include "dograh.secretName" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "dograh.labels" . | nindent 4 }} +type: Opaque +stringData: + OSS_JWT_SECRET: {{ required "secrets.ossJwtSecret is required" .Values.secrets.ossJwtSecret | quote }} + TURN_SECRET: {{ .Values.secrets.turnSecret | quote }} + {{- if eq .Values.database.mode "external" }} + DATABASE_URL: {{ required "secrets.databaseUrl is required when database.mode=external" .Values.secrets.databaseUrl | quote }} + {{- end }} + {{- if eq .Values.redis.mode "external" }} + REDIS_URL: {{ required "secrets.redisUrl is required when redis.mode=external" .Values.secrets.redisUrl | quote }} + {{- end }} + {{- /* internalMinio creds are sourced from the <release>-minio secret via + dograh.dbEnv; only externalMinio pulls creds from here. 
*/}} + {{- if eq .Values.storage.mode "externalMinio" }} + MINIO_ACCESS_KEY: {{ .Values.secrets.minioAccessKey | quote }} + MINIO_SECRET_KEY: {{ .Values.secrets.minioSecretKey | quote }} + {{- end }} + {{- if eq .Values.storage.mode "s3" }} + {{- if .Values.secrets.awsAccessKeyId }} + AWS_ACCESS_KEY_ID: {{ .Values.secrets.awsAccessKeyId | quote }} + AWS_SECRET_ACCESS_KEY: {{ .Values.secrets.awsSecretAccessKey | quote }} + {{- end }} + {{- end }} + {{- if .Values.secrets.langfuseSecretKey }} + LANGFUSE_SECRET_KEY: {{ .Values.secrets.langfuseSecretKey | quote }} + LANGFUSE_PUBLIC_KEY: {{ .Values.secrets.langfusePublicKey | quote }} + {{- end }} +{{- end }} diff --git a/deploy/helm/dograh/templates/serviceaccount.yaml b/deploy/helm/dograh/templates/serviceaccount.yaml new file mode 100644 index 00000000..7331cada --- /dev/null +++ b/deploy/helm/dograh/templates/serviceaccount.yaml @@ -0,0 +1,13 @@ +{{- if .Values.serviceAccount.create }} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "dograh.serviceAccountName" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "dograh.labels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +{{- end }} diff --git a/deploy/helm/dograh/templates/shared-tmp-pvc.yaml b/deploy/helm/dograh/templates/shared-tmp-pvc.yaml new file mode 100644 index 00000000..5a3546f0 --- /dev/null +++ b/deploy/helm/dograh/templates/shared-tmp-pvc.yaml @@ -0,0 +1,18 @@ +{{- if .Values.sharedTmp.enabled }} +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ include "dograh.fullname" . }}-shared-tmp + namespace: {{ .Release.Namespace }} + labels: + {{- include "dograh.labels" . 
| nindent 4 }} +spec: + accessModes: + - ReadWriteMany + {{- if .Values.sharedTmp.storageClassName }} + storageClassName: {{ .Values.sharedTmp.storageClassName | quote }} + {{- end }} + resources: + requests: + storage: {{ .Values.sharedTmp.size }} +{{- end }} diff --git a/deploy/helm/dograh/templates/ui-deployment.yaml b/deploy/helm/dograh/templates/ui-deployment.yaml new file mode 100644 index 00000000..9282220d --- /dev/null +++ b/deploy/helm/dograh/templates/ui-deployment.yaml @@ -0,0 +1,80 @@ +{{- if .Values.ui.enabled }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "dograh.ui.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "dograh.labels" . | nindent 4 }} + app.kubernetes.io/component: ui +spec: + replicas: {{ .Values.ui.replicaCount }} + selector: + matchLabels: + {{- include "dograh.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: ui + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + template: + metadata: + labels: + {{- include "dograh.selectorLabels" . | nindent 8 }} + app.kubernetes.io/component: ui + {{- with .Values.ui.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + serviceAccountName: {{ include "dograh.serviceAccountName" . }} + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + containers: + - name: ui + image: {{ include "dograh.ui.image" . }} + imagePullPolicy: {{ .Values.ui.image.pullPolicy }} + ports: + - name: http + containerPort: {{ .Values.ui.port }} + protocol: TCP + env: + # Next.js standalone binds to $HOSTNAME (falls back to 0.0.0.0). + # Kubernetes injects HOSTNAME=<pod-name>, which /etc/hosts maps to the + # pod IP — so without this, Next listens on the pod IP ONLY. In-cluster + # Service traffic still works, but loopback (kubectl port-forward, local + # health checks) is refused. Pin to 0.0.0.0 to listen on all interfaces. 
+ - name: HOSTNAME + value: "0.0.0.0" + - name: NODE_ENV + value: "oss" + - name: BACKEND_URL + value: {{ default (printf "http://%s:%d" (include "dograh.web.fullname" .) (int .Values.web.service.port)) .Values.ui.backendUrl | quote }} + - name: ENABLE_TELEMETRY + value: {{ .Values.config.enableTelemetry | quote }} + - name: POSTHOG_KEY + value: {{ .Values.config.posthogApiKey | quote }} + - name: POSTHOG_HOST + value: {{ .Values.config.posthogHost | quote }} + livenessProbe: + {{- toYaml .Values.ui.livenessProbe | nindent 12 }} + readinessProbe: + {{- toYaml .Values.ui.readinessProbe | nindent 12 }} + resources: + {{- toYaml .Values.ui.resources | nindent 12 }} + {{- with .Values.ui.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.ui.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.ui.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} +{{- end }} diff --git a/deploy/helm/dograh/templates/ui-pdb.yaml b/deploy/helm/dograh/templates/ui-pdb.yaml new file mode 100644 index 00000000..6d1b74db --- /dev/null +++ b/deploy/helm/dograh/templates/ui-pdb.yaml @@ -0,0 +1,16 @@ +{{- if and .Values.ui.enabled .Values.ui.pdb.enabled }} +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: {{ include "dograh.ui.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "dograh.labels" . | nindent 4 }} + app.kubernetes.io/component: ui +spec: + minAvailable: {{ .Values.ui.pdb.minAvailable }} + selector: + matchLabels: + {{- include "dograh.selectorLabels" . 
| nindent 6 }} + app.kubernetes.io/component: ui +{{- end }} diff --git a/deploy/helm/dograh/templates/ui-service.yaml b/deploy/helm/dograh/templates/ui-service.yaml new file mode 100644 index 00000000..6e807f79 --- /dev/null +++ b/deploy/helm/dograh/templates/ui-service.yaml @@ -0,0 +1,20 @@ +{{- if .Values.ui.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "dograh.ui.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "dograh.labels" . | nindent 4 }} + app.kubernetes.io/component: ui +spec: + type: {{ .Values.ui.service.type }} + ports: + - name: http + port: {{ .Values.ui.service.port }} + targetPort: http + protocol: TCP + selector: + {{- include "dograh.selectorLabels" . | nindent 4 }} + app.kubernetes.io/component: ui +{{- end }} diff --git a/deploy/helm/dograh/templates/web-deployment.yaml b/deploy/helm/dograh/templates/web-deployment.yaml new file mode 100644 index 00000000..c4829d56 --- /dev/null +++ b/deploy/helm/dograh/templates/web-deployment.yaml @@ -0,0 +1,90 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "dograh.web.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "dograh.labels" . | nindent 4 }} + app.kubernetes.io/component: web +spec: + {{- if not .Values.autoscaling.web.enabled }} + replicas: {{ .Values.web.replicaCount }} + {{- end }} + selector: + matchLabels: + {{- include "dograh.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: web + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + template: + metadata: + labels: + {{- include "dograh.selectorLabels" . | nindent 8 }} + app.kubernetes.io/component: web + {{- with .Values.web.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + serviceAccountName: {{ include "dograh.serviceAccountName" . }} + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . 
| nindent 8 }} + {{- end }} + # Long-lived signaling WebSockets keep state in-process; honor the + # configured drain window so in-flight calls survive a rolling + # update. See README "Decisions log". + terminationGracePeriodSeconds: {{ .Values.web.terminationGracePeriodSeconds }} + containers: + - name: web + image: {{ include "dograh.image" . }} + imagePullPolicy: {{ .Values.image.pullPolicy }} + command: ["./scripts/run_web.sh"] + ports: + - name: http + containerPort: {{ .Values.web.port }} + protocol: TCP + envFrom: + {{- include "dograh.backendEnvFrom" . | nindent 12 }} + env: + - name: WEB_PORT + value: {{ .Values.web.port | quote }} + {{- include "dograh.dbEnv" . | nindent 12 }} + # Distinct probes: readiness flips fast (drain), liveness is + # slower (process aliveness). + livenessProbe: + {{- toYaml .Values.web.livenessProbe | nindent 12 }} + readinessProbe: + {{- toYaml .Values.web.readinessProbe | nindent 12 }} + lifecycle: + preStop: + # Sleep so the gateway / load balancer observes the pod + # NotReady and stops sending new connections before SIGTERM + # propagates to uvicorn. + exec: + command: ["sh", "-c", "sleep {{ .Values.web.preStopSleepSeconds }}"] + resources: + {{- toYaml .Values.web.resources | nindent 12 }} + volumeMounts: + {{- include "dograh.sharedTmpVolumeMounts" . | nindent 12 }} + volumes: + {{- include "dograh.sharedTmpVolumes" . | nindent 8 }} + {{- with .Values.web.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.web.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.web.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.web.topologySpreadConstraints }} + topologySpreadConstraints: + {{- toYaml . 
| nindent 8 }} + {{- end }} diff --git a/deploy/helm/dograh/templates/web-hpa.yaml b/deploy/helm/dograh/templates/web-hpa.yaml new file mode 100644 index 00000000..cdc04868 --- /dev/null +++ b/deploy/helm/dograh/templates/web-hpa.yaml @@ -0,0 +1,38 @@ +{{- if .Values.autoscaling.web.enabled }} +# WARNING: CPU/memory is a poor signal for WebRTC signaling. WebSockets +# are long-lived and low-CPU; this HPA will not respond to connection +# pressure. Replace with a custom metric (active connections, active +# calls) as soon as one is exposed. +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "dograh.web.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "dograh.labels" . | nindent 4 }} + app.kubernetes.io/component: web +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ include "dograh.web.fullname" . }} + minReplicas: {{ .Values.autoscaling.web.minReplicas }} + maxReplicas: {{ .Values.autoscaling.web.maxReplicas }} + metrics: + {{- if .Values.autoscaling.web.targetCPUUtilizationPercentage }} + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: {{ .Values.autoscaling.web.targetCPUUtilizationPercentage }} + {{- end }} + {{- if .Values.autoscaling.web.targetMemoryUtilizationPercentage }} + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: {{ .Values.autoscaling.web.targetMemoryUtilizationPercentage }} + {{- end }} +{{- end }} diff --git a/deploy/helm/dograh/templates/web-pdb.yaml b/deploy/helm/dograh/templates/web-pdb.yaml new file mode 100644 index 00000000..39d74b1a --- /dev/null +++ b/deploy/helm/dograh/templates/web-pdb.yaml @@ -0,0 +1,16 @@ +{{- if .Values.web.pdb.enabled }} +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: {{ include "dograh.web.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "dograh.labels" . 
| nindent 4 }} + app.kubernetes.io/component: web +spec: + minAvailable: {{ .Values.web.pdb.minAvailable }} + selector: + matchLabels: + {{- include "dograh.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: web +{{- end }} diff --git a/deploy/helm/dograh/templates/web-service.yaml b/deploy/helm/dograh/templates/web-service.yaml new file mode 100644 index 00000000..159d96bf --- /dev/null +++ b/deploy/helm/dograh/templates/web-service.yaml @@ -0,0 +1,22 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "dograh.web.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "dograh.labels" . | nindent 4 }} + app.kubernetes.io/component: web + {{- with .Values.web.service.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + type: {{ .Values.web.service.type }} + ports: + - name: http + port: {{ .Values.web.service.port }} + targetPort: http + protocol: TCP + selector: + {{- include "dograh.selectorLabels" . | nindent 4 }} + app.kubernetes.io/component: web diff --git a/deploy/helm/dograh/values.schema.json b/deploy/helm/dograh/values.schema.json new file mode 100644 index 00000000..e2b746cd --- /dev/null +++ b/deploy/helm/dograh/values.schema.json @@ -0,0 +1,48 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "Dograh Helm chart values", + "type": "object", + "properties": { + "database": { + "type": "object", + "properties": { + "mode": { + "type": "string", + "enum": ["internal", "external"] + } + }, + "required": ["mode"] + }, + "redis": { + "type": "object", + "properties": { + "mode": { + "type": "string", + "enum": ["internal", "external"] + } + }, + "required": ["mode"] + }, + "storage": { + "type": "object", + "properties": { + "mode": { + "type": "string", + "enum": ["internalMinio", "externalMinio", "s3"] + } + }, + "required": ["mode"] + }, + "exposure": { + "type": "object", + "properties": { + "mode": { + "type": "string", + "enum": ["gatewayApi", 
"ingress"] + } + }, + "required": ["mode"] + } + }, + "required": ["database", "redis", "storage", "exposure"] +} diff --git a/deploy/helm/dograh/values.yaml b/deploy/helm/dograh/values.yaml new file mode 100644 index 00000000..2cb09b9c --- /dev/null +++ b/deploy/helm/dograh/values.yaml @@ -0,0 +1,554 @@ +# Dograh Helm chart — default values. +# +# Conventions: +# - "mode" fields are enums; see values.schema.json for allowed values. +# - Anything sensitive (passwords, tokens, signing keys) is split into the +# `secrets:` section and rendered as a Kubernetes Secret. Non-sensitive +# config lives in `config:` and renders as a ConfigMap. +# - The chart never ships real defaults for credentials. Operators must +# override `secrets.*` (or supply an existing Secret name). + +# ----------------------------------------------------------------------------- +# Global image config — applied to web, workers, ariManager, campaignOrchestrator +# ----------------------------------------------------------------------------- +image: + registry: docker.io + repository: dograhai/dograh-api + tag: latest + pullPolicy: IfNotPresent + +imagePullSecrets: [] +# - name: regcred + +nameOverride: "" +fullnameOverride: "" + +serviceAccount: + create: true + name: "" + annotations: {} + +# ----------------------------------------------------------------------------- +# Stateful dependency modes. 
+# +# database.mode: +# internal — bundled Postgres manifest (pgvector image; see `postgresql` below) +# external — operator supplies DATABASE_URL via secrets.databaseUrl +# redis.mode: +# internal — bundled Redis manifest (see `redisinternal` below) +# external — operator supplies REDIS_URL via secrets.redisUrl +# storage.mode: +# internalMinio — bundled MinIO manifest (see `minio` below) +# externalMinio — operator supplies a MinIO-compatible endpoint + creds +# s3 — sets ENABLE_AWS_S3=true; uses AWS S3 +# exposure.mode: +# gatewayApi — renders Gateway + HTTPRoute (gateway.networking.k8s.io/v1) +# ingress — renders Ingress resources (networking.k8s.io/v1) +# ----------------------------------------------------------------------------- +database: + mode: internal + # For external mode, secrets.databaseUrl must be set. + +redis: + mode: internal + # For external mode, secrets.redisUrl must be set. + +storage: + mode: internalMinio + # For externalMinio mode, set externalMinio.endpoint + secrets.minioAccessKey + # + secrets.minioSecretKey. + externalMinio: + endpoint: "" # e.g. minio.example.com + publicEndpoint: "" # browser-visible URL + secure: false + bucket: voice-audio + # For s3 mode, set s3.region. AWS credentials are picked up from the pod's + # IAM role (IRSA recommended) or from secrets.awsAccessKeyId + secrets.awsSecretAccessKey. + s3: + region: us-east-1 + bucket: voice-audio + publicEndpoint: "" # e.g. https://s3.amazonaws.com + +exposure: + # Default is `ingress` because it works out-of-the-box on any cluster + # without requiring Gateway API CRDs. Production deployments should + # prefer `gatewayApi` per HELM_DEPLOYMENT_PLAN.md — switch the mode + # and supply gatewayClassName. + mode: ingress + # Gateway API config (when mode=gatewayApi). + gatewayApi: + # Set to false to skip rendering the Gateway resource and instead + # attach HTTPRoutes to a pre-existing Gateway (parentRef.name below). 
+ createGateway: true + gatewayClassName: "" # required when createGateway=true (e.g. "istio", "envoy-gateway", "aws-alb") + listenerHostname: "" # optional SNI hostname for the listener; empty = wildcard + # Reference an existing Gateway instead of creating one. + # Ignored when createGateway=true. + parentRefs: + - name: dograh + namespace: "" # empty = same namespace as the release + # Ingress config (when mode=ingress). + ingress: + className: "" # e.g. "nginx", "alb" + annotations: {} + # Hostname for the API/UI. UI is served at / and API under /api/. + # MinIO browser-visible path uses the same hostname under /voice-audio/. + host: "" # e.g. dograh.example.com + tls: + enabled: false + secretName: "" # operator-managed TLS secret in the release namespace + +# ----------------------------------------------------------------------------- +# Non-sensitive runtime config — rendered into a ConfigMap and injected via +# envFrom on every backend pod. Sensitive values live under `secrets:` below. +# ----------------------------------------------------------------------------- +config: + environment: production + logLevel: INFO + backendApiEndpoint: "" # public URL the browser uses to reach the API; auto-derived from exposure.host if empty in NOTES + minioBucket: voice-audio + minioEndpoint: "" # internal cluster endpoint (auto-set when internalMinio) + minioPublicEndpoint: "" # browser-visible endpoint (auto-set when ingress/gateway path exposes MinIO) + minioSecure: false + enableAwsS3: false + enableTelemetry: true + posthogHost: https://us.i.posthog.com + posthogApiKey: phc_ItizB1dP6yv7ZYobbcqrpxTdbomDA8hJFSEmAMdYvIr + forceTurnRelay: false + turnHost: "" # public hostname/IP of coturn (the LoadBalancer address) + fastapiWorkers: 1 # informational only; web tier scales by pod, not in-pod workers + +# ----------------------------------------------------------------------------- +# Secrets — rendered into a Kubernetes Secret unless secrets.existingSecret is +# set. 
NEVER commit real values here; override via -f overrides.yaml or +# --set-string at install time. +# ----------------------------------------------------------------------------- +secrets: + # If set, the chart skips rendering its own Secret and assumes this Secret + # already exists in the release namespace with all keys below. + existingSecret: "" + + # Required when database.mode=external. + databaseUrl: "" # e.g. postgresql+asyncpg://user:pass@host:5432/dograh + # Required when redis.mode=external. + redisUrl: "" # e.g. redis://:pass@host:6379 + + # MinIO / S3 credentials. + minioAccessKey: "" + minioSecretKey: "" + awsAccessKeyId: "" # only used when storage.mode=s3 and not using IRSA + awsSecretAccessKey: "" + + # JWT signing key for the OSS auth path. MUST be overridden in production. + ossJwtSecret: "ChangeMeInProduction" + + # TURN REST API shared secret (matches coturn.staticAuthSecret below). + turnSecret: "" + + # Optional Langfuse tracing. + langfuseSecretKey: "" + langfusePublicKey: "" + langfuseHost: "" + +# ----------------------------------------------------------------------------- +# Shared /tmp PVC. +# +# AUDIT FINDING: api/services/pipecat/event_handlers.py writes WAV/transcript +# tempfiles in the web process and enqueues an ARQ job that reads those exact +# paths in the worker (api/tasks/s3_upload.py). In compose this works because +# all processes share the `shared-tmp` volume. In Kubernetes web and worker run +# in separate pods. Options: +# 1. Enable this PVC (ReadWriteMany required) to mount /tmp/dograh-shared +# into both web and arq-worker pods. Use this for v1. +# 2. Refactor event_handlers.py to upload from the web process and pass a +# storage key (not a local path) to the ARQ job. Preferred long-term; +# see deploy/helm/dograh/README.md "Open TODOs". 
+# If your cluster lacks RWX (most cloud default storage classes are RWO), +# you MUST take option (2) before splitting web and worker pods, or end-of- +# call uploads will fail silently. +sharedTmp: + enabled: false + storageClassName: "" # must be an RWX-capable class (e.g. efs-sc, azurefile, longhorn-rwx) + size: 10Gi + mountPath: /tmp/dograh-shared + +# ----------------------------------------------------------------------------- +# Web tier (FastAPI + WebSocket signaling) +# ----------------------------------------------------------------------------- +web: + replicaCount: 2 + port: 8000 + + # Long-lived signaling WebSockets keep per-connection state in process + # memory (api/routes/webrtc_signaling.py). A naive pod restart drops every + # in-flight call. The two settings below give the gateway time to stop + # routing new connections to a terminating pod and give in-flight calls + # time to finish. + terminationGracePeriodSeconds: 600 + # preStop sleep: long enough for the load balancer to observe the pod going + # NotReady and stop sending new connections. 15s is conservative for most + # controllers (gateway/nginx/ALB). + preStopSleepSeconds: 15 + + resources: + # These are conservative starting numbers. Tune to your workload — + # WebRTC signaling is mostly idle but bursty during call setup. + requests: + cpu: 200m + memory: 512Mi + limits: + cpu: "2" + memory: 2Gi + + # Distinct probes so the pod can fail readiness during drain without being + # killed for liveness. liveness has a longer threshold (process is alive) + # while readiness flips quickly (stop receiving new connections). 
+ livenessProbe: + httpGet: + path: /api/v1/health + port: 8000 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + failureThreshold: 6 + readinessProbe: + httpGet: + path: /api/v1/health + port: 8000 + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 2 + + service: + type: ClusterIP + port: 8000 + annotations: {} + + pdb: + enabled: true + minAvailable: 1 + + podAnnotations: {} + nodeSelector: {} + tolerations: [] + # Recommend spreading web pods across zones / nodes. + topologySpreadConstraints: [] + affinity: {} + +# ----------------------------------------------------------------------------- +# ARQ background workers +# ----------------------------------------------------------------------------- +workers: + replicaCount: 1 + + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: "1" + memory: 1Gi + + # exec probe — workers have no HTTP endpoint. + livenessProbe: + exec: + # The entrypoint exec's the worker, so it runs as PID 1; grep its cmdline. + # Avoids pgrep/procps, which isn't in the slim runtime image. Matching a + # single argv token (no spaces) — argv is NUL-separated in /proc. + command: ["sh", "-c", "grep -qa api.tasks.arq.WorkerSettings /proc/1/cmdline"] + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + failureThreshold: 3 + + podAnnotations: {} + nodeSelector: {} + tolerations: [] + affinity: {} + +# ----------------------------------------------------------------------------- +# ARI manager — TELEPHONY SINGLETON +# +# Maintains an outbound WebSocket to Asterisk and is the canonical receiver of +# ARI events. Running >1 replica produces duplicate event handling. The chart +# hard-codes replicas:1 and strategy:Recreate; there is NO replica knob here +# on purpose. Add proper leader election before relaxing this. 
+# ----------------------------------------------------------------------------- +ariManager: + enabled: true + + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + + livenessProbe: + exec: + # PID 1 cmdline check (procps-free; see workers.livenessProbe). + command: ["sh", "-c", "grep -qa api.services.telephony.ari_manager /proc/1/cmdline"] + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + failureThreshold: 3 + + podAnnotations: {} + nodeSelector: {} + tolerations: [] + affinity: {} + +# ----------------------------------------------------------------------------- +# Campaign orchestrator — CAMPAIGN SINGLETON +# +# Uses in-memory deduplication locks (api/services/campaign/campaign_orchestrator.py +# `_processing_locks`). Running >1 replica would silently break scheduling. +# Same singleton rules as ariManager: no replica knob, Recreate strategy. +# ----------------------------------------------------------------------------- +campaignOrchestrator: + enabled: true + + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + + livenessProbe: + exec: + # PID 1 cmdline check (procps-free; see workers.livenessProbe). + command: ["sh", "-c", "grep -qa api.services.campaign.campaign_orchestrator /proc/1/cmdline"] + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + failureThreshold: 3 + + podAnnotations: {} + nodeSelector: {} + tolerations: [] + affinity: {} + +# ----------------------------------------------------------------------------- +# Next.js UI +# ----------------------------------------------------------------------------- +ui: + enabled: true + replicaCount: 2 + + image: + registry: docker.io + repository: dograhai/dograh-ui + tag: latest + pullPolicy: IfNotPresent + + port: 3010 + + # Server-side (SSR) URL. Defaults to the in-cluster web Service. 
+ backendUrl: "" # auto-set in template when empty + + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + + livenessProbe: + httpGet: + path: / + port: 3010 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + failureThreshold: 3 + readinessProbe: + httpGet: + path: / + port: 3010 + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 2 + + service: + type: ClusterIP + port: 3010 + + pdb: + enabled: true + minAvailable: 1 + + podAnnotations: {} + nodeSelector: {} + tolerations: [] + affinity: {} + +# ----------------------------------------------------------------------------- +# coturn — TURN media relay +# ----------------------------------------------------------------------------- +coturn: + enabled: true + + image: + registry: docker.io + repository: coturn/coturn + tag: "4.8.0" + pullPolicy: IfNotPresent + + # External IP advertised by coturn for NAT traversal. This is the + # LoadBalancer IP of the coturn Service. There is a chicken-and-egg here: + # the LB IP may not be known until after install. See NOTES.txt for the + # supported workflow (install with placeholder, kubectl get svc, helm + # upgrade --set coturn.externalIp=<LB_IP>). + externalIp: "" + + realm: dograh.com + + # Coturn uses TURN REST API authentication (HMAC-SHA1). The secret here + # MUST match secrets.turnSecret — the chart will warn at install time if + # they diverge. + staticAuthSecretFromSecretsKey: turnSecret + + # Relay port range. AWS NLB has a default quota of 50 listeners per LB, + # so the default 49 ports (49152-49200) sit just inside the limit. + # Increasing this requires either a higher NLB listener quota or + # additional TURN deployments. + relayPortRange: + min: 49152 + max: 49200 + + # Standard TURN ports. + ports: + plain: 3478 + tls: 5349 + + # TLS for turns:// — NOT WIRED IN v1. The original docker-compose exposes + # 5349 but does not configure cert paths.
v1 scopes to plain TURN over + # UDP/TCP. See README.md "Open TODOs". + tls: + enabled: false + + service: + type: LoadBalancer + annotations: {} + # externalTrafficPolicy: Local preserves the client IP, which TURN auth + # benefits from. Some LBs need this set to "Cluster" to be reachable. + externalTrafficPolicy: Local + + resources: + requests: + cpu: 200m + memory: 256Mi + limits: + cpu: "2" + memory: 1Gi + + podAnnotations: {} + nodeSelector: {} + tolerations: [] + affinity: {} + +# ----------------------------------------------------------------------------- +# Migration Job +# ----------------------------------------------------------------------------- +migrate: + # Run alembic upgrade head as a Helm hook: post-install (so the bundled + # Postgres exists first) and pre-upgrade (so migrations land before new code). + enabled: true + + # Hard cap on how long a migration may run. A failed/exceeded migration fails + # the install/upgrade (no retry: backoffLimit is 0; rollback needs --atomic). + activeDeadlineSeconds: 600 + + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + +# ----------------------------------------------------------------------------- +# Horizontal Pod Autoscaling — web tier only. +# +# WARNING: CPU/memory is a poor signal for WebRTC signaling workloads. +# WebSockets are long-lived, low-CPU, and steady-memory; CPU will look flat +# while you saturate per-pod connection limits. Replace this with a custom +# metric (active WS connections, active calls) once one is exposed. +# ----------------------------------------------------------------------------- +autoscaling: + web: + enabled: false + minReplicas: 2 + maxReplicas: 10 + targetCPUUtilizationPercentage: 70 + targetMemoryUtilizationPercentage: 80 + +# ----------------------------------------------------------------------------- +# Bundled stateful deps for the internal/all-in-one modes.
These are plain +# in-chart manifests (templates/internal-*.yaml) on official upstream images — +# not subcharts. Each renders only in its internal mode: +# postgresql -> database.mode == internal +# redisinternal-> redis.mode == internal +# minio -> storage.mode == internalMinio +# For production, use the external/managed modes instead (see database/redis/ +# storage above). +# ----------------------------------------------------------------------------- +postgresql: + # Official pgvector image = upstream Postgres + the `vector` extension the app + # migrations require. Keep the tag on a supported Postgres major. + image: + registry: docker.io + repository: pgvector/pgvector + tag: pg17 + pullPolicy: IfNotPresent + auth: + username: dograh # also the initdb superuser (needed for CREATE EXTENSION) + password: "" # auto-generated and persisted across upgrades if empty + database: dograh + persistence: + size: 8Gi + storageClass: "" # "" = cluster default StorageClass + resources: + requests: { cpu: 100m, memory: 256Mi } + limits: { cpu: "1", memory: 1Gi } + +redisinternal: + # Official upstream Redis. Single node, password-protected, AOF persistence. + # (Key kept as `redisinternal` so it doesn't collide with `redis.mode` above.) + image: + registry: docker.io + repository: redis + tag: 7.4-alpine + pullPolicy: IfNotPresent + auth: + password: "" # auto-generated and persisted across upgrades if empty + persistence: + size: 8Gi + storageClass: "" + resources: + requests: { cpu: 50m, memory: 64Mi } + limits: { cpu: 500m, memory: 256Mi } + +minio: + # Official upstream MinIO. Root creds are shared with the app (single source of + # truth) so they can't drift; the app creates its bucket on first use. 
+  image:
+    registry: docker.io
+    repository: minio/minio
+    tag: RELEASE.2025-04-22T22-12-26Z
+    pullPolicy: IfNotPresent
+  auth:
+    rootUser: minioadmin
+    rootPassword: ""   # auto-generated and persisted across upgrades if empty
+  persistence:
+    size: 20Gi
+    storageClass: ""
+  resources:
+    requests: { cpu: 100m, memory: 256Mi }
+    limits: { cpu: "1", memory: 1Gi }
diff --git a/scripts/run_ari_manager.sh b/scripts/run_ari_manager.sh
new file mode 100755
index 00000000..95b459a1
--- /dev/null
+++ b/scripts/run_ari_manager.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+#
+# Entrypoint for the ARI manager: exports the variables from api/.env when that
+# file exists, then replaces this shell with the Python module via exec.
+set -euo pipefail
+
+# BASE_DIR is the parent of the directory that contains this script.
+BASE_DIR="$(cd "$(dirname "$(dirname "${BASH_SOURCE[0]}")")" && pwd)"
+ENV_FILE="$BASE_DIR/api/.env"
+
+if [[ -f "$ENV_FILE" ]]; then
+  # Separate statements rather than `set -a && . file && set +a`: bash ignores
+  # `set -e` for non-final commands of an && list, hiding a broken .env.
+  set -a
+  # shellcheck source=/dev/null
+  . "$ENV_FILE"
+  set +a
+fi
+
+cd "$BASE_DIR"
+exec python -m api.services.telephony.ari_manager
diff --git a/scripts/run_arq_worker.sh b/scripts/run_arq_worker.sh
new file mode 100755
index 00000000..abe0e2b5
--- /dev/null
+++ b/scripts/run_arq_worker.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+#
+# Entrypoint for the ARQ worker: exports the variables from api/.env when that
+# file exists, then replaces this shell with `python -m arq` via exec.
+set -euo pipefail
+
+# BASE_DIR is the parent of the directory that contains this script.
+BASE_DIR="$(cd "$(dirname "$(dirname "${BASH_SOURCE[0]}")")" && pwd)"
+ENV_FILE="$BASE_DIR/api/.env"
+
+if [[ -f "$ENV_FILE" ]]; then
+  # Separate statements rather than `set -a && . file && set +a`: bash ignores
+  # `set -e` for non-final commands of an && list, hiding a broken .env.
+  set -a
+  # shellcheck source=/dev/null
+  . "$ENV_FILE"
+  set +a
+fi
+
+cd "$BASE_DIR"
+exec python -m arq api.tasks.arq.WorkerSettings --custom-log-dict api.tasks.arq.LOG_CONFIG
diff --git a/scripts/run_campaign_orchestrator.sh b/scripts/run_campaign_orchestrator.sh
new file mode 100755
index 00000000..a8ed3b37
--- /dev/null
+++ b/scripts/run_campaign_orchestrator.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+#
+# Entrypoint for the campaign orchestrator: exports the variables from api/.env
+# when that file exists, then replaces this shell with the Python module.
+set -euo pipefail
+
+# BASE_DIR is the parent of the directory that contains this script.
+BASE_DIR="$(cd "$(dirname "$(dirname "${BASH_SOURCE[0]}")")" && pwd)"
+ENV_FILE="$BASE_DIR/api/.env"
+
+if [[ -f "$ENV_FILE" ]]; then
+  # Separate statements rather than `set -a && . file && set +a`: bash ignores
+  # `set -e` for non-final commands of an && list, hiding a broken .env.
+  set -a
+  # shellcheck source=/dev/null
+  . "$ENV_FILE"
+  set +a
+fi
+
+cd "$BASE_DIR"
+exec python -m api.services.campaign.campaign_orchestrator
diff --git a/scripts/run_migrate.sh b/scripts/run_migrate.sh
new file mode 100755
index 00000000..904b22f4
--- /dev/null
+++ b/scripts/run_migrate.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+#
+# Runs database migrations: exports the variables from api/.env when that file
+# exists, then execs `alembic upgrade head` using api/alembic.ini.
+set -euo pipefail
+
+# BASE_DIR is the parent of the directory that contains this script.
+BASE_DIR="$(cd "$(dirname "$(dirname "${BASH_SOURCE[0]}")")" && pwd)"
+ENV_FILE="$BASE_DIR/api/.env"
+
+if [[ -f "$ENV_FILE" ]]; then
+  # Separate statements rather than `set -a && . file && set +a`: bash ignores
+  # `set -e` for non-final commands of an && list, hiding a broken .env.
+  set -a
+  # shellcheck source=/dev/null
+  . "$ENV_FILE"
+  set +a
+fi
+
+cd "$BASE_DIR"
+exec alembic -c "$BASE_DIR/api/alembic.ini" upgrade head
diff --git a/scripts/run_web.sh b/scripts/run_web.sh
new file mode 100755
index 00000000..913eb73a
--- /dev/null
+++ b/scripts/run_web.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+#
+# Entrypoint for the web server: exports the variables from api/.env when that
+# file exists, then execs uvicorn serving api.app:app on 0.0.0.0.
+set -euo pipefail
+
+# BASE_DIR is the parent of the directory that contains this script.
+BASE_DIR="$(cd "$(dirname "$(dirname "${BASH_SOURCE[0]}")")" && pwd)"
+ENV_FILE="$BASE_DIR/api/.env"
+
+if [[ -f "$ENV_FILE" ]]; then
+  # Separate statements rather than `set -a && . file && set +a`: bash ignores
+  # `set -e` for non-final commands of an && list, hiding a broken .env.
+  set -a
+  # shellcheck source=/dev/null
+  . "$ENV_FILE"
+  set +a
+fi
+
+# Listen port: WEB_PORT when set and non-empty (may come from .env), else 8000.
+PORT="${WEB_PORT:-8000}"
+
+cd "$BASE_DIR"
+exec uvicorn api.app:app --host 0.0.0.0 --port "$PORT" --workers 1