dograh/deploy/helm/dograh/values.yaml
Abhishek Kumar 0d59ae776c feat: add Helm chart for Kubernetes deployment
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-26 18:19:35 +05:30

527 lines
17 KiB
YAML

# Dograh Helm chart — default values.
#
# Conventions:
#   - "mode" fields are enums; see values.schema.json for allowed values.
#   - Anything sensitive (passwords, tokens, signing keys) is split into the
#     `secrets:` section and rendered as a Kubernetes Secret. Non-sensitive
#     config lives in `config:` and renders as a ConfigMap.
#   - The chart never ships real defaults for credentials. Operators must
#     override `secrets.*` (or supply an existing Secret name).
# -----------------------------------------------------------------------------
# Global image config — applied to web, workers, ariManager, campaignOrchestrator
# -----------------------------------------------------------------------------
image:
  registry: docker.io
  repository: dograhai/dograh-api
  # NOTE(review): `latest` is a mutable tag. Combined with
  # pullPolicy=IfNotPresent, a node that already cached `latest` will keep
  # running the old build after an upgrade. Pin a release tag (or switch the
  # pull policy to Always) for reproducible rollouts.
  tag: latest
  pullPolicy: IfNotPresent
# Registry credentials, as a list of `{name: <secret>}` entries.
# Presumably attached to every pod the chart renders — confirm in templates.
imagePullSecrets: []
# - name: regcred
# Name overrides; empty strings keep the chart's generated names
# (assumed to follow the usual Helm naming helpers — confirm in _helpers.tpl).
nameOverride: ""
fullnameOverride: ""
serviceAccount:
  # Whether the chart creates a ServiceAccount for its pods.
  create: true
  # Presumably an empty name lets the chart derive one — confirm in templates.
  name: ""
  # Extra annotations for the ServiceAccount.
  annotations: {}
# -----------------------------------------------------------------------------
# Stateful dependency modes.
#
# database.mode:
#   internal      — bundled Bitnami PostgreSQL subchart (postgresql.enabled=true)
#   external      — operator supplies DATABASE_URL via secrets.databaseUrl
# redis.mode:
#   internal      — bundled Bitnami Redis subchart (redisInternal.enabled=true)
#   external      — operator supplies REDIS_URL via secrets.redisUrl
# storage.mode:
#   internalMinio — bundled MinIO subchart (minio.enabled=true)
#   externalMinio — operator supplies a MinIO-compatible endpoint + creds
#   s3            — sets ENABLE_AWS_S3=true; uses AWS S3
# exposure.mode:
#   gatewayApi    — renders Gateway + HTTPRoute (gateway.networking.k8s.io/v1)
#   ingress       — renders Ingress resources (networking.k8s.io/v1)
# -----------------------------------------------------------------------------
database:
  # One of: internal | external.
  mode: internal
  # For external mode, secrets.databaseUrl must be set.
redis:
  # One of: internal | external.
  mode: internal
  # For external mode, secrets.redisUrl must be set.
storage:
  # One of: internalMinio | externalMinio | s3.
  mode: internalMinio
  # NOTE(review): `externalMinio` and `s3` are nested under `storage` here
  # because their comments are phrased relative to `mode`; confirm against the
  # templates that they are read as storage.externalMinio.* / storage.s3.*.
  #
  # For externalMinio mode, set externalMinio.endpoint + secrets.minioAccessKey
  # + secrets.minioSecretKey.
  externalMinio:
    endpoint: ""  # e.g. minio.example.com
    publicEndpoint: ""  # browser-visible URL
    secure: false
    bucket: voice-audio
  # For s3 mode, set s3.region. AWS credentials are picked up from the pod's
  # IAM role (IRSA recommended) or from secrets.awsAccessKeyId +
  # secrets.awsSecretAccessKey.
  s3:
    region: us-east-1
    bucket: voice-audio
    publicEndpoint: ""  # e.g. https://s3.amazonaws.com
exposure:
  # Default is `ingress` because it works out-of-the-box on any cluster
  # without requiring Gateway API CRDs. Production deployments should
  # prefer `gatewayApi` per HELM_DEPLOYMENT_PLAN.md — switch the mode
  # and supply gatewayClassName.
  mode: ingress
  # Gateway API config (when mode=gatewayApi).
  gatewayApi:
    # Set to false to skip rendering the Gateway resource and instead
    # attach HTTPRoutes to a pre-existing Gateway (parentRef.name below).
    createGateway: true
    # Required when createGateway=true (e.g. "istio", "envoy-gateway",
    # "aws-alb").
    gatewayClassName: ""
    # Optional SNI hostname for the listener; empty = wildcard.
    listenerHostname: ""
    # Reference an existing Gateway instead of creating one.
    # Ignored when createGateway=true.
    parentRefs:
      - name: dograh
        namespace: ""  # empty = same namespace as the release
  # Ingress config (when mode=ingress).
  ingress:
    className: ""  # e.g. "nginx", "alb"
    annotations: {}
    # NOTE(review): `host` and `tls` are placed under `ingress` here. The
    # comment on config.backendApiEndpoint refers to `exposure.host`, so
    # confirm against the templates whether these two keys belong one level
    # up, directly under `exposure`.
    #
    # Hostname for the API/UI. UI is served at / and API under /api/.
    # MinIO browser-visible path uses the same hostname under /voice-audio/.
    host: ""  # e.g. dograh.example.com
    tls:
      enabled: false
      secretName: ""  # operator-managed TLS secret in the release namespace
# -----------------------------------------------------------------------------
# Non-sensitive runtime config — rendered into a ConfigMap and injected via
# envFrom on every backend pod. Sensitive values live under `secrets:` below.
# -----------------------------------------------------------------------------
config:
  environment: production
  logLevel: INFO
  # Public URL the browser uses to reach the API. When empty, NOTES derives
  # it from the exposure hostname.
  backendApiEndpoint: ""
  minioBucket: voice-audio
  # Internal cluster endpoint (auto-set when internalMinio).
  minioEndpoint: ""
  # Browser-visible endpoint (auto-set when the ingress/gateway path exposes
  # MinIO).
  minioPublicEndpoint: ""
  minioSecure: false
  enableAwsS3: false
  enableTelemetry: true
  posthogHost: https://us.i.posthog.com
  posthogApiKey: phc_ItizB1dP6yv7ZYobbcqrpxTdbomDA8hJFSEmAMdYvIr
  forceTurnRelay: false
  # Public hostname/IP of coturn (the LoadBalancer address).
  turnHost: ""
  # Informational only; the web tier scales by pod, not by in-pod workers.
  fastapiWorkers: 1
# -----------------------------------------------------------------------------
# Secrets — rendered into a Kubernetes Secret unless secrets.existingSecret is
# set. NEVER commit real values here; override via -f overrides.yaml or
# --set-string at install time.
# -----------------------------------------------------------------------------
secrets:
  # If set, the chart skips rendering its own Secret and assumes this Secret
  # already exists in the release namespace with all keys below.
  existingSecret: ""
  # Required when database.mode=external.
  databaseUrl: ""  # e.g. postgresql+asyncpg://user:pass@host:5432/dograh
  # Required when redis.mode=external.
  redisUrl: ""  # e.g. redis://:pass@host:6379
  # MinIO / S3 credentials.
  minioAccessKey: ""
  minioSecretKey: ""
  awsAccessKeyId: ""  # only used when storage.mode=s3 and not using IRSA
  awsSecretAccessKey: ""
  # JWT signing key for the OSS auth path. MUST be overridden in production.
  # NOTE(review): shipping a well-known default signing key contradicts the
  # "never ships real defaults for credentials" convention at the top of this
  # file: anyone who knows this string can mint valid tokens on an install
  # that forgot to override it. Prefer an empty default plus a template-side
  # `required`/generated value; left unchanged here because the templates
  # that consume it are not visible.
  ossJwtSecret: "ChangeMeInProduction"
  # TURN REST API shared secret. coturn reads it through the key named by
  # coturn.staticAuthSecretFromSecretsKey (default: turnSecret).
  turnSecret: ""
  # Optional Langfuse tracing.
  langfuseSecretKey: ""
  langfusePublicKey: ""
  langfuseHost: ""
# -----------------------------------------------------------------------------
# Shared /tmp PVC.
#
# AUDIT FINDING: api/services/pipecat/event_handlers.py writes WAV/transcript
# tempfiles in the web process and enqueues an ARQ job that reads those exact
# paths in the worker (api/tasks/s3_upload.py). In compose this works because
# all processes share the `shared-tmp` volume. In Kubernetes web and worker run
# in separate pods. Options:
#   1. Enable this PVC (ReadWriteMany required) to mount /tmp/dograh-shared
#      into both web and arq-worker pods. Use this for v1.
#   2. Refactor event_handlers.py to upload from the web process and pass a
#      storage key (not a local path) to the ARQ job. Preferred long-term;
#      see deploy/helm/dograh/README.md "Open TODOs".
# If your cluster lacks RWX (most cloud default storage classes are RWO),
# you MUST take option (2) before splitting web and worker pods, or end-of-
# call uploads will fail silently.
sharedTmp:
  enabled: false
  # Must be an RWX-capable class (e.g. efs-sc, azurefile, longhorn-rwx).
  storageClassName: ""
  size: 10Gi
  mountPath: /tmp/dograh-shared
# -----------------------------------------------------------------------------
# Web tier (FastAPI + WebSocket signaling)
# -----------------------------------------------------------------------------
web:
  replicaCount: 2
  port: 8000
  # Long-lived signaling WebSockets keep per-connection state in process
  # memory (api/routes/webrtc_signaling.py). A naive pod restart drops every
  # in-flight call. The two settings below give the gateway time to stop
  # routing new connections to a terminating pod and give in-flight calls
  # time to finish.
  terminationGracePeriodSeconds: 600
  # preStop sleep: long enough for the load balancer to observe the pod going
  # NotReady and stop sending new connections. 15s is conservative for most
  # controllers (gateway/nginx/ALB).
  preStopSleepSeconds: 15
  resources:
    # These are conservative starting numbers. Tune to your workload —
    # WebRTC signaling is mostly idle but bursty during call setup.
    requests:
      cpu: 200m
      memory: 512Mi
    limits:
      cpu: "2"
      memory: 2Gi
  # Distinct probes so the pod can fail readiness during drain without being
  # killed for liveness. liveness has a longer threshold (process is alive)
  # while readiness flips quickly (stop receiving new connections).
  livenessProbe:
    httpGet:
      path: /api/v1/health
      port: 8000
    initialDelaySeconds: 30
    periodSeconds: 30
    timeoutSeconds: 5
    failureThreshold: 6
  readinessProbe:
    httpGet:
      path: /api/v1/health
      port: 8000
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 3
    failureThreshold: 2
  service:
    type: ClusterIP
    port: 8000
    annotations: {}
  pdb:
    enabled: true
    minAvailable: 1
  podAnnotations: {}
  nodeSelector: {}
  tolerations: []
  # Recommend spreading web pods across zones / nodes.
  topologySpreadConstraints: []
  affinity: {}
# -----------------------------------------------------------------------------
# ARQ background workers
# -----------------------------------------------------------------------------
workers:
  replicaCount: 1
  resources:
    requests:
      cpu: 100m
      memory: 256Mi
    limits:
      cpu: "1"
      memory: 1Gi
  # exec probe — workers have no HTTP endpoint.
  # The `[a]` bracket keeps the regex from matching the probe's own `sh -c`
  # command line (pgrep -f matches full command lines), which could otherwise
  # let the probe pass with no worker process running.
  livenessProbe:
    exec:
      command: ["sh", "-c", "pgrep -f '[a]rq api.tasks.arq.WorkerSettings' > /dev/null"]
    initialDelaySeconds: 30
    periodSeconds: 30
    timeoutSeconds: 5
    failureThreshold: 3
  podAnnotations: {}
  nodeSelector: {}
  tolerations: []
  affinity: {}
# -----------------------------------------------------------------------------
# ARI manager — TELEPHONY SINGLETON
#
# Maintains an outbound WebSocket to Asterisk and is the canonical receiver of
# ARI events. Running >1 replica produces duplicate event handling. The chart
# hard-codes replicas:1 and strategy:Recreate; there is NO replica knob here
# on purpose. Add proper leader election before relaxing this.
# -----------------------------------------------------------------------------
ariManager:
  enabled: true
  resources:
    requests:
      cpu: 100m
      memory: 256Mi
    limits:
      cpu: 500m
      memory: 512Mi
  # exec probe: checks that the ARI manager process exists.
  # The pattern is quoted and starts with `[a]` so that neither shell globbing
  # nor the probe's own `sh -c` command line can satisfy the match.
  livenessProbe:
    exec:
      command: ["sh", "-c", "pgrep -f '[a]pi.services.telephony.ari_manager' > /dev/null"]
    initialDelaySeconds: 30
    periodSeconds: 30
    timeoutSeconds: 5
    failureThreshold: 3
  podAnnotations: {}
  nodeSelector: {}
  tolerations: []
  affinity: {}
# -----------------------------------------------------------------------------
# Campaign orchestrator — CAMPAIGN SINGLETON
#
# Uses in-memory deduplication locks (api/services/campaign/campaign_orchestrator.py
# `_processing_locks`). Running >1 replica would silently break scheduling.
# Same singleton rules as ariManager: no replica knob, Recreate strategy.
# -----------------------------------------------------------------------------
campaignOrchestrator:
  enabled: true
  resources:
    requests:
      cpu: 100m
      memory: 256Mi
    limits:
      cpu: 500m
      memory: 512Mi
  # exec probe: checks that the campaign orchestrator process exists.
  # The pattern is quoted and starts with `[a]` so that neither shell globbing
  # nor the probe's own `sh -c` command line can satisfy the match.
  livenessProbe:
    exec:
      command: ["sh", "-c", "pgrep -f '[a]pi.services.campaign.campaign_orchestrator' > /dev/null"]
    initialDelaySeconds: 30
    periodSeconds: 30
    timeoutSeconds: 5
    failureThreshold: 3
  podAnnotations: {}
  nodeSelector: {}
  tolerations: []
  affinity: {}
# -----------------------------------------------------------------------------
# Next.js UI
# -----------------------------------------------------------------------------
ui:
  enabled: true
  replicaCount: 2
  # The UI ships as its own image, separate from the global `image` block.
  image:
    registry: docker.io
    repository: dograhai/dograh-ui
    tag: latest
    pullPolicy: IfNotPresent
  port: 3010
  # Server-side (SSR) URL. Defaults to the in-cluster web Service.
  backendUrl: ""  # auto-set in template when empty
  resources:
    requests:
      cpu: 100m
      memory: 256Mi
    limits:
      cpu: 500m
      memory: 512Mi
  livenessProbe:
    httpGet:
      path: /
      port: 3010
    initialDelaySeconds: 30
    periodSeconds: 30
    timeoutSeconds: 5
    failureThreshold: 3
  readinessProbe:
    httpGet:
      path: /
      port: 3010
    initialDelaySeconds: 5
    periodSeconds: 10
    timeoutSeconds: 3
    failureThreshold: 2
  service:
    type: ClusterIP
    port: 3010
  pdb:
    enabled: true
    minAvailable: 1
  podAnnotations: {}
  nodeSelector: {}
  tolerations: []
  affinity: {}
# -----------------------------------------------------------------------------
# coturn — TURN media relay
# -----------------------------------------------------------------------------
coturn:
  enabled: true
  image:
    registry: docker.io
    repository: coturn/coturn
    tag: "4.8.0"
    pullPolicy: IfNotPresent
  # External IP advertised by coturn for NAT traversal. This is the
  # LoadBalancer IP of the coturn Service. There is a chicken-and-egg here:
  # the LB IP may not be known until after install. See NOTES.txt for the
  # supported workflow (install with placeholder, kubectl get svc, helm
  # upgrade --set coturn.externalIp=<IP>).
  externalIp: ""
  realm: dograh.com
  # Coturn uses TURN REST API authentication (HMAC-SHA1). This value is the
  # NAME of the key under `secrets:` that holds the shared secret, not the
  # secret itself; with the default it resolves to secrets.turnSecret, so
  # coturn and the API use the same value.
  # NOTE(review): the original comment says the chart warns at install time
  # if the two diverge — confirm that check exists in the templates.
  staticAuthSecretFromSecretsKey: turnSecret
  # Relay port range. AWS NLB has a default quota of 50 listeners per LB,
  # so the default 49 ports (49152-49200) sits just inside the limit.
  # Increasing this requires either a higher NLB listener quota or
  # additional TURN deployments.
  relayPortRange:
    min: 49152
    max: 49200
  # Standard TURN ports.
  ports:
    plain: 3478
    tls: 5349
  # TLS for turns:// — NOT WIRED IN v1. The original docker-compose exposes
  # 5349 but does not configure cert paths. v1 scopes to plain TURN over
  # UDP/TCP. See README.md "Open TODOs".
  tls:
    enabled: false
  service:
    type: LoadBalancer
    annotations: {}
    # externalTrafficPolicy: Local preserves the client IP, which TURN auth
    # benefits from. Some LBs need this set to "Cluster" to be reachable.
    externalTrafficPolicy: Local
  resources:
    requests:
      cpu: 200m
      memory: 256Mi
    limits:
      cpu: "2"
      memory: 1Gi
  podAnnotations: {}
  nodeSelector: {}
  tolerations: []
  affinity: {}
# -----------------------------------------------------------------------------
# Migration Job
# -----------------------------------------------------------------------------
migrate:
  # Run alembic upgrade head as a pre-install / pre-upgrade Helm hook.
  enabled: true
  # Hard cap on how long a migration may run. A failed or timed-out migration
  # fails the hook and therefore aborts the install/upgrade; the Job is not
  # retried because backoffLimit is 0. Helm only rolls a release back
  # automatically when invoked with --atomic.
  activeDeadlineSeconds: 600
  resources:
    requests:
      cpu: 100m
      memory: 256Mi
    limits:
      cpu: 500m
      memory: 512Mi
# -----------------------------------------------------------------------------
# Horizontal Pod Autoscaling — web tier only.
#
# WARNING: CPU/memory is a poor signal for WebRTC signaling workloads.
# WebSockets are long-lived, low-CPU, and steady-memory; CPU will look flat
# while you saturate per-pod connection limits. Replace this with a custom
# metric (active WS connections, active calls) once one is exposed.
# -----------------------------------------------------------------------------
autoscaling:
  # HPA settings for the web Deployment; disabled by default.
  web:
    enabled: false
    minReplicas: 2
    maxReplicas: 10
    targetCPUUtilizationPercentage: 70
    targetMemoryUtilizationPercentage: 80
# -----------------------------------------------------------------------------
# Bitnami subcharts. Each is gated by its own `enabled` flag set by mode
# resolution in templates/_helpers.tpl. Override the subchart's own values
# under these keys.
# -----------------------------------------------------------------------------
postgresql:
  # This key is read by the subchart's `condition`.
  # NOTE(review): Helm resolves a dependency `condition` from values before
  # any template renders, so _helpers.tpl cannot flip this flag. When
  # database.mode=external, set postgresql.enabled=false explicitly —
  # confirm against Chart.yaml.
  enabled: true
  auth:
    username: dograh
    password: ""  # auto-generated if empty
    database: dograh
  primary:
    persistence:
      enabled: true
      size: 8Gi
redisInternal:
  # Bitnami Redis subchart values, aliased to redisInternal to avoid
  # colliding with `redis.mode` above. `redisInternal.enabled` is the gating
  # flag for whether the subchart deploys.
  # Presumably this must be set to false alongside redis.mode=external —
  # confirm against Chart.yaml.
  enabled: true
  auth:
    enabled: true
    password: ""  # auto-generated if empty
  master:
    persistence:
      enabled: true
      size: 8Gi
  replica:
    replicaCount: 0  # standalone primary by default
minio:
  # Gating flag for the bundled MinIO subchart (storage.mode=internalMinio).
  # Presumably must be set to false for the other storage modes — confirm
  # against Chart.yaml.
  enabled: true
  auth:
    rootUser: minioadmin
    rootPassword: ""  # auto-generated if empty
  # Bucket created on first start; matches config.minioBucket.
  defaultBuckets: "voice-audio"
  persistence:
    enabled: true
    size: 20Gi