mirror of
https://github.com/dograh-hq/dograh.git
synced 2026-06-07 07:55:16 +02:00
feat: add rolling updates for production deployment (#175)
* feat: rolling update uvicorn workers * script fixes
This commit is contained in:
parent
ad6261333d
commit
aed5a782fb
10 changed files with 785 additions and 497 deletions
2
.gitignore
vendored
2
.gitignore
vendored
|
|
@ -8,10 +8,10 @@ __pycache__
|
|||
/logs/
|
||||
/run/
|
||||
infrastructure/
|
||||
nginx/
|
||||
prd/
|
||||
.vercel
|
||||
|
||||
venv/
|
||||
.venv/
|
||||
.playwright-mcp
|
||||
coturn/
|
||||
|
|
@ -29,7 +29,7 @@ dograh/
|
|||
|
||||
```bash
|
||||
# Start infrastructure services (postgres, redis, minio)
|
||||
./scripts/start_services.sh --dev
|
||||
./scripts/start_services_dev.sh
|
||||
|
||||
# Stop all services
|
||||
./scripts/stop_services.sh
|
||||
|
|
|
|||
|
|
@ -59,17 +59,15 @@ ENV PYTHONUNBUFFERED=1
|
|||
|
||||
# Copy application code
|
||||
COPY ./api ./api
|
||||
COPY ./scripts/start_services.sh ./scripts/start_services.sh
|
||||
COPY ./scripts/start_services_dev.sh ./scripts/start_services_dev.sh
|
||||
|
||||
ENV PYTHONPATH=/app
|
||||
|
||||
# Disable file logging in Docker - logs go to stdout for docker logs
|
||||
ENV LOG_TO_FILE=false
|
||||
# Keep container alive by waiting for background processes
|
||||
ENV WAIT_FOR_PROCESSES=true
|
||||
|
||||
# Expose the port FastAPI will run on
|
||||
EXPOSE 8000
|
||||
|
||||
# Run the FastAPI app with uvicorn
|
||||
CMD ["./scripts/start_services.sh"]
|
||||
CMD ["./scripts/start_services_dev.sh"]
|
||||
|
|
@ -59,7 +59,7 @@ bash scripts/setup_pipecat.sh
|
|||
```
|
||||
10. Start backend services
|
||||
```
|
||||
bash scripts/start_services.sh --dev
|
||||
bash scripts/start_services_dev.sh
|
||||
```
|
||||
Verify that your backend server is running
|
||||
```
|
||||
|
|
|
|||
7
nginx/dograh_upstream.conf.template
Normal file
7
nginx/dograh_upstream.conf.template
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
# AUTO-GENERATED by start_services.sh / rolling_update.sh
|
||||
# Do not edit the generated file directly - edit this template instead.
|
||||
upstream dograh_backend {
|
||||
least_conn;
|
||||
|
||||
{{UVICORN_UPSTREAM_SERVERS}}
|
||||
}
|
||||
471
scripts/rolling_update.sh
Executable file
471
scripts/rolling_update.sh
Executable file
|
|
@ -0,0 +1,471 @@
|
|||
#!/usr/bin/env bash
|
||||
# rolling_update.sh — Zero-downtime rolling update using dual-band port strategy
|
||||
#
|
||||
# Usage:
|
||||
# ./scripts/rolling_update.sh
|
||||
# DRAIN_TIMEOUT=600 ./scripts/rolling_update.sh
|
||||
#
|
||||
# Old workers drain active calls (WebSocket/WebRTC) before shutting down.
|
||||
# Nginx switches to new workers only after every one passes health checks.
|
||||
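# If new workers fail to start, fail their health checks, or the generated
# nginx config does not validate, the script rolls back: it kills the new
# workers and leaves the old workers serving traffic. Failures after nginx has
# been reloaded (drain, service restarts) are not rolled back.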
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
###############################################################################
|
||||
### CONFIGURATION
|
||||
###############################################################################
|
||||
|
||||
# Repository root = parent of the directory that holds this script.
# "<script dir>/.." is used instead of dirname-of-dirname because the latter
# collapses to "." when the script is invoked by bare name from inside
# scripts/ (e.g. `bash rolling_update.sh`), which would select scripts/ itself.
BASE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

ENV_FILE="$BASE_DIR/api/.env"
RUN_DIR="$BASE_DIR/run"
BASE_LOG_DIR="$BASE_DIR/logs"
LATEST_LINK="$BASE_LOG_DIR/latest"
VENV_PATH="$BASE_DIR/venv"

NGINX_UPSTREAM_TEMPLATE="$BASE_DIR/nginx/dograh_upstream.conf.template"
NGINX_UPSTREAM_CONF="/etc/nginx/conf.d/dograh_upstream.conf"

HEALTH_CHECK_ENDPOINT="/api/v1/health"

# Load environment (every variable assigned in the file is exported)
if [[ -f "$ENV_FILE" ]]; then
  set -a && . "$ENV_FILE" && set +a
fi

UVICORN_BASE_PORT=${UVICORN_BASE_PORT:-8000}
CPU_CORES=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 1)

# The two port bands sit 100 ports apart, so one band can hold at most 100
# workers before it would run into the other band's ports. The CPU-derived
# default is capped at that limit; an explicit out-of-range setting is an error.
if [[ -z "${FASTAPI_WORKERS:-}" ]]; then
  FASTAPI_WORKERS=$CPU_CORES
  if [[ "$FASTAPI_WORKERS" =~ ^[0-9]+$ ]] && ((10#$FASTAPI_WORKERS > 100)); then
    FASTAPI_WORKERS=100
  fi
fi
if ! [[ "$FASTAPI_WORKERS" =~ ^[0-9]+$ ]] \
  || ((10#$FASTAPI_WORKERS < 1 || 10#$FASTAPI_WORKERS > 100)); then
  echo "ERROR: FASTAPI_WORKERS must be an integer between 1 and 100 (got '$FASTAPI_WORKERS')" >&2
  exit 1
fi
# Normalize (drops leading zeros so later arithmetic is not parsed as octal).
FASTAPI_WORKERS=$((10#$FASTAPI_WORKERS))
ARQ_WORKERS=${ARQ_WORKERS:-1}

# Tuning knobs (override via environment)
DRAIN_TIMEOUT=${DRAIN_TIMEOUT:-300}            # seconds to wait for old workers to drain
HEALTH_MAX_ATTEMPTS=${HEALTH_MAX_ATTEMPTS:-30} # per-worker health-check retries
HEALTH_INTERVAL=${HEALTH_INTERVAL:-2}          # seconds between health-check retries

cd "$BASE_DIR"
|
||||
|
||||
###############################################################################
|
||||
### HELPERS
|
||||
###############################################################################
|
||||
|
||||
# Timestamped log helpers. INFO and WARN lines go to stdout, ERROR lines to
# stderr. All arguments are joined into one message.
_log_line() {
  local level=$1
  shift
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] ${level}: $*"
}
log_info() { _log_line INFO "$@"; }
log_warn() { _log_line WARN "$@"; }
log_error() { _log_line ERROR "$@" >&2; }
|
||||
|
||||
# Print the first port of a band: band A starts at UVICORN_BASE_PORT, band B
# starts 100 ports above it.
# Globals:   UVICORN_BASE_PORT (read)
# Arguments: $1 - band name, "A" or "B"
# Outputs:   the band's base port on stdout
# Returns:   0 on success; 1 (with a message on stderr) for any other band
#            name, so a corrupted value is not silently treated as band B
band_base_port() {
  local band=${1:-}
  case "$band" in
    A) echo "$UVICORN_BASE_PORT" ;;
    B) echo $((UVICORN_BASE_PORT + 100)) ;;
    *)
      echo "band_base_port: unknown band '$band' (expected A or B)" >&2
      return 1
      ;;
  esac
}
|
||||
|
||||
# Print the other band name: "B" when given "A", and "A" for any other value.
opposite_band() {
  case "$1" in
    A) echo "B" ;;
    *) echo "A" ;;
  esac
}
|
||||
|
||||
# Print every descendant PID of a process (children, grandchildren, ...).
# Arguments: $1 - PID whose descendants are wanted
# Outputs:   space-separated PIDs on stdout (only whitespace when there are none)
get_descendants() {
  local parent_pid=$1
  local descendants=""
  local children child # `child` is local so the loop cannot clobber a caller's variable
  # pgrep exits non-zero when the process has no children; that is not an error.
  children=$(pgrep -P "$parent_pid" 2>/dev/null || true)
  for child in $children; do
    descendants="$descendants $child $(get_descendants "$child")"
  done
  echo "$descendants"
}
|
||||
|
||||
# Send a signal to a process and all of its descendants (best effort).
# Descendants are signalled first, then the process itself; PIDs that are no
# longer alive are skipped and kill failures are ignored.
# Arguments: $1 - root PID
#            $2 - signal option handed to kill, e.g. "-TERM" or "-KILL"
#                 (defaults to "-TERM" when omitted)
kill_process_tree() {
  local pid=$1
  local signal=${2:--TERM}
  local descendants desc_pid
  descendants=$(get_descendants "$pid")
  for desc_pid in $descendants; do
    if kill -0 "$desc_pid" 2>/dev/null; then
      kill "$signal" "$desc_pid" 2>/dev/null || true
    fi
  done
  if kill -0 "$pid" 2>/dev/null; then
    kill "$signal" "$pid" 2>/dev/null || true
  fi
}
|
||||
|
||||
###############################################################################
|
||||
### ROLLBACK
|
||||
###############################################################################
|
||||
|
||||
# Kill all new-band workers and leave old workers + nginx untouched.
# For each of the FASTAPI_WORKERS ports of the band, the worker recorded in
# $RUN_DIR/uvicorn_<port>.pid is SIGKILLed together with its descendants and
# the pidfile is removed.
# Globals:   FASTAPI_WORKERS, RUN_DIR (read)
# Arguments: $1 - band whose workers are killed
rollback_new_workers() {
  local new_band=$1
  # All loop state is local: callers invoke this from inside their own
  # `for ((w ...))` loops, and a global counter here would overwrite theirs.
  local new_base w port pidfile pid
  new_base=$(band_base_port "$new_band")

  log_error "ROLLING BACK — killing new band $new_band workers"

  for ((w = 0; w < FASTAPI_WORKERS; w++)); do
    port=$((new_base + w))
    pidfile="$RUN_DIR/uvicorn_${port}.pid"
    [[ -f "$pidfile" ]] || continue
    pid=$(<"$pidfile")
    if kill -0 "$pid" 2>/dev/null; then
      kill_process_tree "$pid" "-KILL"
      log_info " Killed uvicorn_${port} (PID $pid)"
    fi
    rm -f "$pidfile"
  done

  log_error "Rollback complete. Old workers and nginx are untouched."
}
|
||||
|
||||
###############################################################################
|
||||
### PHASE 0: PRE-FLIGHT CHECKS
|
||||
###############################################################################
|
||||
|
||||
log_info "=== Phase 0: Pre-flight checks ==="

# Determine current and new band
if [[ -f "$RUN_DIR/active_band" ]]; then
  OLD_BAND=$(<"$RUN_DIR/active_band")
else
  log_error "No active_band file found in $RUN_DIR. Run start_services.sh first."
  exit 1
fi

# Only A and B are valid; refuse anything else so a corrupted or hand-edited
# file cannot silently select the wrong ports.
if [[ "$OLD_BAND" != "A" && "$OLD_BAND" != "B" ]]; then
  log_error "Invalid band '$OLD_BAND' in $RUN_DIR/active_band (expected A or B)."
  exit 1
fi

NEW_BAND=$(opposite_band "$OLD_BAND")
OLD_BASE=$(band_base_port "$OLD_BAND")
NEW_BASE=$(band_base_port "$NEW_BAND")

log_info "Current band: $OLD_BAND (ports ${OLD_BASE}–$((OLD_BASE + FASTAPI_WORKERS - 1)))"
log_info "New band: $NEW_BAND (ports ${NEW_BASE}–$((NEW_BASE + FASTAPI_WORKERS - 1)))"

# Verify at least one old worker is running
old_running=0
for ((w = 0; w < FASTAPI_WORKERS; w++)); do
  port=$((OLD_BASE + w))
  pidfile="$RUN_DIR/uvicorn_${port}.pid"
  if [[ -f "$pidfile" ]]; then
    pid=$(<"$pidfile")
    if kill -0 "$pid" 2>/dev/null; then
      old_running=$((old_running + 1))
    fi
  fi
done

if [[ $old_running -eq 0 ]]; then
  log_error "No old workers are running. Use start_services.sh for a cold start."
  exit 1
fi
log_info "Found $old_running running old worker(s)"

# Report whether something listens on TCP port $1.
# Returns 0 when a listener exists, 1 when the port is free, 2 when ss itself
# failed. The ss output is captured before it is searched so that a missing or
# failing ss is detected instead of being mistaken for "port is free".
port_is_listening() {
  local listeners
  listeners=$(ss -tln "sport = :$1" 2>/dev/null) || return 2
  grep -q LISTEN <<<"$listeners"
}

# Verify new ports are free
for ((w = 0; w < FASTAPI_WORKERS; w++)); do
  port=$((NEW_BASE + w))
  rc=0
  port_is_listening "$port" || rc=$?
  if ((rc == 0)); then
    log_error "Port $port is already in use. Cannot start new band."
    exit 1
  elif ((rc != 1)); then
    log_error "Could not query listening sockets with ss; cannot verify that port $port is free."
    exit 1
  fi
done
log_info "All new-band ports are free"

# Verify nginx is running
if ! pgrep -x nginx >/dev/null 2>&1; then
  log_error "nginx is not running."
  exit 1
fi
log_info "nginx is running"
|
||||
|
||||
###############################################################################
|
||||
### PHASE 1: RUN MIGRATIONS
|
||||
###############################################################################
|
||||
|
||||
log_info "=== Phase 1: Running Alembic migrations ==="

# Activate virtual environment
if [[ -d "$VENV_PATH" && -f "$VENV_PATH/bin/activate" ]]; then
  # Some activate scripts (notably from older virtualenv releases) expand
  # variables that may be unset, which aborts a script running under `set -u`.
  # Relax nounset only while sourcing.
  set +u
  # shellcheck disable=SC1091
  source "$VENV_PATH/bin/activate"
  set -u
else
  log_warn "No virtual environment at $VENV_PATH, continuing without"
fi

# Migrations run before any new worker is started; on failure the script
# stops here with no worker or nginx change made.
if ! alembic -c "$BASE_DIR/api/alembic.ini" upgrade head; then
  log_error "Alembic migration failed. Aborting — nothing has been touched."
  exit 1
fi
log_info "Migrations complete"
|
||||
|
||||
###############################################################################
|
||||
### PHASE 2: START NEW WORKERS
|
||||
###############################################################################
|
||||
|
||||
log_info "=== Phase 2: Starting new workers on band $NEW_BAND ==="

# Resolve log directory: reuse the directory the "latest" symlink points to,
# or create a fresh timestamped one when the link is missing or dangling.
if [[ -L "$LATEST_LINK" && -d "$LATEST_LINK" ]]; then
  link_target=$(readlink "$LATEST_LINK")
  if [[ "$link_target" == /* ]]; then
    # Absolute target: use it as-is instead of prefixing the base log dir.
    LOG_DIR="$link_target"
  else
    LOG_DIR="$BASE_LOG_DIR/$link_target"
  fi
else
  # Create a new timestamped log dir for this deploy
  TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
  LOG_DIR="$BASE_LOG_DIR/$TIMESTAMP"
  mkdir -p "$LOG_DIR"
  rm -f "$LATEST_LINK"
  ln -s "$TIMESTAMP" "$LATEST_LINK"
fi

mkdir -p "$RUN_DIR"

# One single-process uvicorn per port, bound to loopback only. Each runs in a
# background subshell that exec's uvicorn, so $! is the PID recorded in the
# pidfile.
for ((w = 0; w < FASTAPI_WORKERS; w++)); do
  port=$((NEW_BASE + w))
  name="uvicorn_${port}"
  log_info " Starting $name on port $port"

  (
    cd "$BASE_DIR"
    export LOG_FILE_PATH="$LOG_DIR/${name}.log"
    exec uvicorn api.app:app --host 127.0.0.1 --port "$port" \
      >>"$LOG_DIR/${name}.log" 2>&1
  ) &

  pid=$!
  echo "$pid" >"$RUN_DIR/${name}.pid"
  log_info " PID $pid"
done

# Brief pause to let workers bind
sleep 3

# Quick sanity: make sure they haven't crashed immediately
for ((w = 0; w < FASTAPI_WORKERS; w++)); do
  port=$((NEW_BASE + w))
  pid=$(<"$RUN_DIR/uvicorn_${port}.pid")
  if ! kill -0 "$pid" 2>/dev/null; then
    log_error "Worker uvicorn_${port} (PID $pid) died immediately"
    rollback_new_workers "$NEW_BAND"
    exit 1
  fi
done

log_info "All $FASTAPI_WORKERS new workers started"
|
||||
|
||||
###############################################################################
|
||||
### PHASE 3: HEALTH-CHECK EVERY NEW WORKER
|
||||
###############################################################################
|
||||
|
||||
log_info "=== Phase 3: Health-checking new workers ==="

# Every new worker must answer HTTP 200 on the health endpoint before nginx
# is switched. The first worker that fails aborts the deploy.
all_healthy=true
for ((w = 0; w < FASTAPI_WORKERS; w++)); do
  port=$((NEW_BASE + w))
  pid=$(<"$RUN_DIR/uvicorn_${port}.pid")
  healthy=false
  died=false

  for ((attempt = 1; attempt <= HEALTH_MAX_ATTEMPTS; attempt++)); do
    # A worker that has exited will never become healthy; stop polling it
    # instead of burning the whole retry budget.
    if ! kill -0 "$pid" 2>/dev/null; then
      died=true
      break
    fi

    # curl has no overall time limit by default, so a worker that accepts the
    # connection but never answers would hang the deploy; bound each probe.
    http_code=$(curl -s -o /dev/null -w "%{http_code}" \
      --connect-timeout 2 --max-time 5 \
      "http://127.0.0.1:${port}${HEALTH_CHECK_ENDPOINT}" 2>/dev/null) || true
    http_code=${http_code:-000}

    if [[ "$http_code" == "200" ]]; then
      log_info " uvicorn_${port} healthy (attempt $attempt)"
      healthy=true
      break
    fi
    sleep "$HEALTH_INTERVAL"
  done

  if ! $healthy; then
    if $died; then
      log_error " uvicorn_${port} (PID $pid) exited before passing its health check"
    else
      log_error " uvicorn_${port} FAILED health check after $HEALTH_MAX_ATTEMPTS attempts"
    fi
    all_healthy=false
    break
  fi
done

if ! $all_healthy; then
  rollback_new_workers "$NEW_BAND"
  exit 1
fi

log_info "All new workers are healthy"
|
||||
|
||||
###############################################################################
|
||||
### PHASE 4: SWITCH NGINX TO NEW BAND
|
||||
###############################################################################
|
||||
|
||||
log_info "=== Phase 4: Switching nginx to band $NEW_BAND ==="

if [[ ! -f "$NGINX_UPSTREAM_TEMPLATE" ]]; then
  log_error "Nginx upstream template not found at $NGINX_UPSTREAM_TEMPLATE"
  rollback_new_workers "$NEW_BAND"
  exit 1
fi

# Render the upstream template for the band whose first port is $1 and install
# the result as the live nginx upstream config (via sudo tee).
# Returns non-zero when sed or tee fails.
write_upstream_conf() {
  local base=$1
  local servers="" i
  for ((i = 0; i < FASTAPI_WORKERS; i++)); do
    # The literal "\n" is turned into a newline by sed's replacement handling
    # (GNU sed behaviour).
    servers="${servers} server 127.0.0.1:$((base + i));\n"
  done
  sed -e "s|{{UVICORN_UPSTREAM_SERVERS}}|${servers}|" \
    "$NGINX_UPSTREAM_TEMPLATE" | sudo tee "$NGINX_UPSTREAM_CONF" >/dev/null
}

# Undo a failed switch: point the config file back at the old band, kill the
# new workers and exit with status 1.
abort_switch() {
  if ! write_upstream_conf "$OLD_BASE"; then
    log_error "Could not restore the old upstream config at $NGINX_UPSTREAM_CONF"
  fi
  rollback_new_workers "$NEW_BAND"
  exit 1
}

# Generate upstream config for the new band. A write failure is handled here
# explicitly; otherwise `set -e` would abort with the new workers left running.
if ! write_upstream_conf "$NEW_BASE"; then
  log_error "Failed to write nginx upstream config to $NGINX_UPSTREAM_CONF"
  abort_switch
fi

log_info "Generated nginx upstream config with $FASTAPI_WORKERS workers (ports ${NEW_BASE}–$((NEW_BASE + FASTAPI_WORKERS - 1)))"

# Validate config
if ! sudo nginx -t 2>/dev/null; then
  log_error "nginx config validation failed!"
  sudo nginx -t 2>&1 || true
  # Restore old upstream config
  abort_switch
fi

# Reload nginx (graceful — finishes in-flight requests to old upstream).
# A failed reload is rolled back like a failed validation instead of leaving
# the new config on disk and the new workers running unmanaged.
if ! sudo systemctl reload nginx; then
  log_error "nginx reload failed!"
  abort_switch
fi
log_info "nginx reloaded — traffic now routed to band $NEW_BAND"

# Record the switch right away: from here on nginx serves the new band, so an
# interruption during the (potentially long) drain must not leave a stale
# band marker behind for the next run.
echo "$NEW_BAND" >"$RUN_DIR/active_band"
|
||||
|
||||
###############################################################################
|
||||
### PHASE 5: DRAIN OLD WORKERS
|
||||
###############################################################################
|
||||
|
||||
log_info "=== Phase 5: Draining old workers (band $OLD_BAND, timeout ${DRAIN_TIMEOUT}s) ==="

# Collect old worker PIDs and ask each to shut down.
# Every uvicorn pidfile whose port lies in the old band's 100-port range is
# considered, not just the first FASTAPI_WORKERS ports: if the worker count was
# lowered since the previous deploy, the surplus old workers must be drained too.
OLD_PIDS=()
for pidfile in "$RUN_DIR"/uvicorn_*.pid; do
  [[ -e "$pidfile" ]] || continue
  port=${pidfile##*/uvicorn_}
  port=${port%.pid}
  [[ "$port" =~ ^[0-9]+$ ]] || continue
  if ((10#$port < OLD_BASE || 10#$port >= OLD_BASE + 100)); then
    continue
  fi

  pid=$(<"$pidfile")
  if kill -0 "$pid" 2>/dev/null; then
    OLD_PIDS+=("$pid")
    log_info " Sending SIGTERM to uvicorn_${port} (PID $pid)"
    kill_process_tree "$pid" "-TERM"
  fi
  rm -f "$pidfile"
done

if [[ ${#OLD_PIDS[@]} -gt 0 ]]; then
  start_time=$(date +%s)

  # Poll every 5 seconds until all old workers are gone or the timeout hits.
  while true; do
    all_dead=true
    for pid in "${OLD_PIDS[@]}"; do
      if kill -0 "$pid" 2>/dev/null; then
        all_dead=false
        break
      fi
    done

    if $all_dead; then
      log_info "All old workers exited gracefully"
      break
    fi

    elapsed=$(($(date +%s) - start_time))
    if [[ $elapsed -ge $DRAIN_TIMEOUT ]]; then
      log_warn "Drain timeout reached (${DRAIN_TIMEOUT}s). Force-killing remaining old workers."
      for pid in "${OLD_PIDS[@]}"; do
        if kill -0 "$pid" 2>/dev/null; then
          kill_process_tree "$pid" "-KILL"
          log_warn " Force-killed PID $pid"
        fi
      done
      sleep 1
      break
    fi

    log_info " Waiting for old workers to drain... (${elapsed}s / ${DRAIN_TIMEOUT}s)"
    sleep 5
  done
else
  log_warn "No old worker PIDs to drain"
fi
|
||||
|
||||
###############################################################################
|
||||
### PHASE 6: RESTART NON-HTTP SERVICES
|
||||
###############################################################################
|
||||
|
||||
log_info "=== Phase 6: Restarting non-HTTP services ==="

# Services to restart (same as start_services.sh), held in two parallel
# arrays: RESTART_NAMES[k] is started with RESTART_COMMANDS[k].
RESTART_NAMES=("ari_manager" "campaign_orchestrator")
RESTART_COMMANDS=(
  "python -m api.services.telephony.ari_manager"
  "python -m api.services.campaign.campaign_orchestrator"
)

# Add ARQ workers (arq1 .. arq<ARQ_WORKERS>)
i=1
while ((i <= ARQ_WORKERS)); do
  RESTART_NAMES+=("arq$i")
  RESTART_COMMANDS+=("python -m arq api.tasks.arq.WorkerSettings --custom-log-dict api.tasks.arq.LOG_CONFIG")
  i=$((i + 1))
done

for i in "${!RESTART_NAMES[@]}"; do
  name="${RESTART_NAMES[$i]}"
  cmd="${RESTART_COMMANDS[$i]}"
  pidfile="$RUN_DIR/${name}.pid"

  # Stop old instance: SIGTERM, wait 2 seconds, then SIGKILL if it is still alive
  if [[ -f "$pidfile" ]]; then
    oldpid=$(<"$pidfile")
    if kill -0 "$oldpid" 2>/dev/null; then
      log_info " Stopping $name (PID $oldpid)"
      kill_process_tree "$oldpid" "-TERM"
      sleep 2
      if kill -0 "$oldpid" 2>/dev/null; then
        kill_process_tree "$oldpid" "-KILL"
        sleep 1
      fi
    fi
    rm -f "$pidfile"
  fi

  # Start new instance in the background and record its PID
  log_info " Starting $name"
  (
    cd "$BASE_DIR"
    export LOG_FILE_PATH="$LOG_DIR/${name}.log"
    # $cmd is intentionally unquoted: the command string must be word-split.
    # shellcheck disable=SC2086
    exec $cmd >>"$LOG_DIR/${name}.log" 2>&1
  ) &

  pid=$!
  echo "$pid" >"$RUN_DIR/${name}.pid"
  log_info " PID $pid"
done
|
||||
|
||||
###############################################################################
|
||||
### PHASE 7: FINALIZE
|
||||
###############################################################################
|
||||
|
||||
log_info "=== Phase 7: Finalize ==="

# Persist the band that is now live so the next run treats it as the old band.
echo "$NEW_BAND" >"$RUN_DIR/active_band"
log_info "active_band set to $NEW_BAND"

# Summary banner (blank line first, one line per item)
summary_rule="══════════════════════════════════════════════════"
printf '%s\n' \
  "" \
  "$summary_rule" \
  " Rolling update completed successfully" \
  "" \
  " Band: $OLD_BAND → $NEW_BAND" \
  " Workers: $FASTAPI_WORKERS (ports ${NEW_BASE}–$((NEW_BASE + FASTAPI_WORKERS - 1)))" \
  " Services: ${RESTART_NAMES[*]}" \
  " Logs: $LOG_DIR" \
  "$summary_rule"
|
||||
|
|
@ -1,324 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
# rolling_update_uvicorn.sh — Zero-downtime rolling update for uvicorn workers
|
||||
|
||||
set -e # Exit on error
|
||||
|
||||
### CONFIGURATION #############################################################
|
||||
|
||||
# Determine BASE_DIR as the parent of the scripts directory.
# "<script dir>/.." is used instead of dirname-of-dirname, which collapses to
# "." (i.e. scripts/ itself) when the script is run by bare name from scripts/.
BASE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

ENV_FILE="$BASE_DIR/api/.env"
RUN_DIR="$BASE_DIR/run"
BASE_LOG_DIR="$BASE_DIR/logs"          # Base logs directory (same as start_services.sh)
LATEST_LINK="$BASE_LOG_DIR/latest"     # Symlink to latest logs (same as start_services.sh)
VENV_PATH="$BASE_DIR/venv"
HEALTH_CHECK_ENDPOINT="/api/v1/health" # Adjust as needed
MAX_WAIT_SECONDS=310                   # Max wait for graceful shutdown (5 minutes + 10 seconds grace)

# Load environment. Guarded so that a missing file cannot fail the `.` in the
# middle of the && chain and leave allexport (`set -a`) switched on.
if [[ -f "$ENV_FILE" ]]; then
  set -a && . "$ENV_FILE" && set +a
fi

cd "$BASE_DIR"
|
||||
|
||||
### FUNCTIONS ##################################################################
|
||||
|
||||
# Timestamped log helpers: INFO and WARN lines are written to stdout, ERROR
# lines to stderr. All arguments are joined into one message.
_stamp() { date '+%Y-%m-%d %H:%M:%S'; }

log_info() { printf '[%s] INFO: %s\n' "$(_stamp)" "$*"; }

log_error() { printf '[%s] ERROR: %s\n' "$(_stamp)" "$*" >&2; }

log_warning() { printf '[%s] WARN: %s\n' "$(_stamp)" "$*"; }
|
||||
|
||||
# Check whether a TCP port is free.
# Arguments: $1 - port number
# Returns:   1 when lsof reports a listener on the port, 0 otherwise
#            (including when lsof itself fails or is not installed).
check_port_availability() {
  local port=$1
  # The port is quoted so an empty or malformed value cannot be word-split
  # into extra lsof arguments.
  if lsof -Pi ":${port}" -sTCP:LISTEN -t >/dev/null 2>&1; then
    return 1 # Port is in use
  fi
  return 0 # Port is available
}
|
||||
|
||||
# Poll the health endpoint on a local port until it answers HTTP 200.
# Globals:   HEALTH_CHECK_ENDPOINT (read)
# Arguments: $1 - port to probe on 127.0.0.1
# Returns:   0 as soon as a probe returns 200; 1 after 30 failed attempts
#            (one second apart).
wait_for_health_check() {
  local port=$1
  local max_attempts=30
  local attempt=0
  local http_code

  log_info "Waiting for new uvicorn workers to be healthy on port $port..."

  while [ "$attempt" -lt "$max_attempts" ]; do
    # Each probe is time-bounded: curl has no overall time limit by default,
    # so a server that accepts but never answers would stall the loop.
    http_code=$(curl -s -o /dev/null -w "%{http_code}" \
      --connect-timeout 2 --max-time 5 \
      "http://127.0.0.1:${port}${HEALTH_CHECK_ENDPOINT}" 2>/dev/null) || true
    # Compare the whole status code rather than searching for the substring.
    if [[ "$http_code" == "200" ]]; then
      log_info "Health check passed on port $port"
      return 0
    fi
    attempt=$((attempt + 1))
    log_info "Health check attempt $attempt/$max_attempts..."
    sleep 1
  done

  log_error "Health check failed after $max_attempts attempts"
  return 1
}
|
||||
|
||||
get_old_uvicorn_pids() {
|
||||
local pidfile="$RUN_DIR/uvicorn.pid"
|
||||
local pids=""
|
||||
|
||||
if [[ -f "$pidfile" ]]; then
|
||||
# Read the main PID
|
||||
local main_pid=$(<"$pidfile")
|
||||
if kill -0 "$main_pid" 2>/dev/null; then
|
||||
# Get all PIDs in the process group
|
||||
pids=$(ps -o pid= -g $(ps -o pgid= -p "$main_pid" | tr -d ' ') 2>/dev/null || echo "$main_pid")
|
||||
fi
|
||||
fi
|
||||
|
||||
echo "$pids"
|
||||
}
|
||||
|
||||
# Stop the old uvicorn workers: SIGTERM first, SIGKILL after a timeout.
# Globals:   MAX_WAIT_SECONDS (read) - seconds to wait before force-killing
# Arguments: $1 - whitespace/newline separated list of PIDs
# Returns:   always 0
graceful_shutdown_old_workers() {
  local old_pids="$1"
  # Loop variable and timer are local so they cannot overwrite same-named
  # globals in the calling script.
  local pid start_time
  local all_dead=false

  if [[ -z "$old_pids" ]]; then
    log_warning "No old uvicorn workers found to shut down"
    return 0
  fi

  log_info "Starting graceful shutdown of old uvicorn workers (PIDs: $(echo $old_pids | tr '\n' ' '))"

  # Send SIGTERM to trigger graceful shutdown
  for pid in $old_pids; do
    if kill -0 "$pid" 2>/dev/null; then
      log_info "Sending SIGTERM to PID $pid"
      kill -TERM "$pid" 2>/dev/null || true
    fi
  done

  # Wait for processes to exit gracefully, checking every 5 seconds
  start_time=$(date +%s)

  while [[ $(($(date +%s) - start_time)) -lt $MAX_WAIT_SECONDS ]]; do
    all_dead=true
    for pid in $old_pids; do
      if kill -0 "$pid" 2>/dev/null; then
        all_dead=false
        break
      fi
    done

    if $all_dead; then
      log_info "All old workers shut down gracefully"
      return 0
    fi

    log_info "Waiting for workers to complete active requests... ($(($(date +%s) - start_time))s elapsed)"
    sleep 5
  done

  # Force kill if still running after timeout
  log_warning "Timeout reached, force killing remaining workers"
  for pid in $old_pids; do
    if kill -0 "$pid" 2>/dev/null; then
      log_warning "Force killing PID $pid"
      kill -KILL "$pid" 2>/dev/null || true
    fi
  done

  sleep 1
  return 0
}
|
||||
|
||||
# Start a new multi-worker uvicorn instance on the given port.
# Writes the new PID to $RUN_DIR/uvicorn_new.pid and the port to
# $RUN_DIR/uvicorn_new.port, logging into the directory the "latest" log
# symlink points to.
# Globals:   ENV_FILE, VENV_PATH, LATEST_LINK, BASE_LOG_DIR, BASE_DIR, RUN_DIR
#            (read); FASTAPI_WORKERS (read, from the env file);
#            LOG_FILE_PATH (exported)
# Arguments: $1 - port for the new instance
# Returns:   0 when the process is still alive 5 seconds after launch;
#            1 when FASTAPI_WORKERS is unset, the log directory is missing,
#            or the process died immediately
start_new_uvicorn_workers() {
  local new_port=$1
  # Declared up front so `local` never masks a command substitution's status.
  local log_dir="" script_pid timestamp new_pid

  log_info "Starting new uvicorn workers on port $new_port..."

  # Get configuration from environment
  set -a && . "$ENV_FILE" && set +a

  if [[ -z "${FASTAPI_WORKERS:-}" ]]; then
    log_error "FASTAPI_WORKERS environment variable is not set"
    return 1
  fi

  # Activate virtual environment (path quoted so spaces in it are safe)
  source "${VENV_PATH}/bin/activate"

  # Use the latest log directory created by start_services.sh:
  # the symlink must exist and point to a valid directory.
  if [[ -L "$LATEST_LINK" ]] && [[ -d "$LATEST_LINK" ]]; then
    # Follow the symlink to get the actual directory
    log_dir="$BASE_LOG_DIR/$(readlink "$LATEST_LINK")"
    log_info "Using existing log directory: $log_dir"
  else
    log_error "No log directory found. Run start_services.sh first to create logs directory."
    log_error "Expected symlink at: $LATEST_LINK"
    return 1
  fi

  # Create unique log filename using timestamp and script PID to avoid conflicts
  script_pid=$$ # PID of this rolling_update script (for uniqueness)
  timestamp=$(date '+%H%M%S')
  export LOG_FILE_PATH="$log_dir/uvicorn-rollover-${timestamp}-${script_pid}.log"

  log_info "Starting uvicorn with $FASTAPI_WORKERS workers on port $new_port"
  log_info "Logs: $LOG_FILE_PATH"

  # Start in background (same pattern as start_services.sh). The subshell
  # inherits the exported LOG_FILE_PATH and exec's uvicorn, so $! is its PID.
  (
    cd "$BASE_DIR"
    exec uvicorn api.app:app --host 0.0.0.0 --port "$new_port" \
      --workers "$FASTAPI_WORKERS" >>"$LOG_FILE_PATH" 2>&1
  ) &

  new_pid=$!
  echo "$new_pid" >"$RUN_DIR/uvicorn_new.pid"

  # Save port information
  echo "$new_port" >"$RUN_DIR/uvicorn_new.port"

  log_info "New uvicorn started with PID $new_pid"

  # Wait a bit for startup
  sleep 5

  # Check if process is still running
  if ! kill -0 "$new_pid" 2>/dev/null; then
    log_error "New uvicorn process died immediately"
    return 1
  fi

  return 0
}
|
||||
|
||||
# Promote the "new" uvicorn bookkeeping files to the main ones and drop the
# saved list of old PIDs.
# Globals: RUN_DIR (read)
finalize_rollover() {
  local suffix

  log_info "Finalizing rollover..."

  # uvicorn_new.pid -> uvicorn.pid, then uvicorn_new.port -> uvicorn.port
  # (each only when the "new" file exists)
  for suffix in pid port; do
    if [[ -f "$RUN_DIR/uvicorn_new.$suffix" ]]; then
      mv "$RUN_DIR/uvicorn_new.$suffix" "$RUN_DIR/uvicorn.$suffix"
    fi
  done

  # Clean up old PID file if it exists
  rm -f "$RUN_DIR/uvicorn_old.pid"

  log_info "Rollover completed successfully"
}
|
||||
|
||||
# Abort a rollover: kill the new uvicorn instance (if any) and remove its
# temporary PID/port files.
# Globals:   RUN_DIR (read)
# Arguments: $1 - old port (accepted but not used by this function)
#            $2 - PID of the new uvicorn instance; may be empty
rollback() {
  local old_port=$1
  local new_pid=$2

  log_error "Rolling back due to failure..."

  # Kill new workers if they exist
  if [[ -n "$new_pid" ]]; then
    if kill -0 "$new_pid" 2>/dev/null; then
      log_info "Killing new uvicorn workers (PID: $new_pid)"
      # Try the process group first, then fall back to the single process.
      kill -KILL -"$new_pid" 2>/dev/null || kill -KILL "$new_pid" 2>/dev/null || true
    fi
  fi

  # Clean up temporary files
  rm -f "$RUN_DIR/uvicorn_new.pid" "$RUN_DIR/uvicorn_new.port"

  log_error "Rollback completed"
}
|
||||
|
||||
### MAIN LOGIC ################################################################
|
||||
|
||||
# Main flow: validate the requested port, start a new uvicorn instance on it,
# wait for it to become healthy, shut the old instance down, then promote the
# new PID/port files.

# Check arguments
if [[ $# -ne 1 ]]; then
  echo "Usage: $0 <NEW_PORT>"
  echo "Example: $0 8001"
  exit 1
fi

NEW_PORT=$1

# Validate port number
if ! [[ "$NEW_PORT" =~ ^[0-9]+$ ]] || [ "$NEW_PORT" -lt 1 ] || [ "$NEW_PORT" -gt 65535 ]; then
  log_error "Invalid port number: $NEW_PORT"
  exit 1
fi

# Check if port is available
if ! check_port_availability "$NEW_PORT"; then
  log_error "Port $NEW_PORT is already in use"
  exit 1
fi

# Get old port from file or environment
OLD_PORT=""
if [[ -f "$RUN_DIR/uvicorn.port" ]]; then
  OLD_PORT=$(<"$RUN_DIR/uvicorn.port")
elif [[ -f "$ENV_FILE" ]]; then
  set -a && . "$ENV_FILE" && set +a
  OLD_PORT="${FASTAPI_PORT:-}"
fi

if [[ "$NEW_PORT" == "$OLD_PORT" ]]; then
  log_error "New port is the same as old port ($NEW_PORT)"
  exit 1
fi

log_info "Starting rolling update from port ${OLD_PORT:-unknown} to port $NEW_PORT"

# Create run directory if it doesn't exist
mkdir -p "$RUN_DIR"

# Get old uvicorn PIDs before starting new ones
OLD_PIDS=$(get_old_uvicorn_pids)
if [[ -n "$OLD_PIDS" ]]; then
  # Save old PIDs for potential rollback
  echo "$OLD_PIDS" >"$RUN_DIR/uvicorn_old.pid"
  log_info "Found old uvicorn workers: $(echo $OLD_PIDS | tr '\n' ' ')"
else
  log_warning "No existing uvicorn workers found"
fi

# Start new uvicorn workers
if ! start_new_uvicorn_workers "$NEW_PORT"; then
  log_error "Failed to start new uvicorn workers"
  exit 1
fi

NEW_PID=$(<"$RUN_DIR/uvicorn_new.pid")

# Wait for new workers to be healthy
if ! wait_for_health_check "$NEW_PORT"; then
  log_error "New workers failed health check"
  rollback "$OLD_PORT" "$NEW_PID"
  exit 1
fi

# Give the system some time to stabilize before shutting down old workers
log_info "Waiting for system to stabilize..."
sleep 5

# Gracefully shutdown old workers
if [[ -n "$OLD_PIDS" ]]; then
  graceful_shutdown_old_workers "$OLD_PIDS"
fi

# Finalize the rollover
finalize_rollover

# Summary
echo "──────────────────────────────────────────────────"
echo "✓ Rolling update completed successfully"
echo " Old port: ${OLD_PORT:-none}"
echo " New port: $NEW_PORT"
echo " New PID: $NEW_PID"
# LATEST_LINK is already a full path under BASE_LOG_DIR; prefixing it with
# BASE_LOG_DIR again printed a doubled, non-existent path.
echo " Logs: $LATEST_LINK/"
echo "──────────────────────────────────────────────────"
|
||||
|
|
@ -1,42 +1,6 @@
|
|||
#!/usr/bin/env bash
|
||||
set -e # Exit on error
|
||||
|
||||
###############################################################################
|
||||
### ARGUMENT PARSING
|
||||
###############################################################################
|
||||
|
||||
# Development mode is off unless --dev is passed.
DEV_MODE=false

# Print command-line usage to stdout.
show_help() {
  printf '%s\n' \
    "Usage: $0 [OPTIONS]" \
    "" \
    "Options:" \
    " --dev Enable development mode with auto-reload for API changes" \
    " --help Show this help message" \
    "" \
    "Examples:" \
    " $0 # Start in production mode" \
    " $0 --dev # Start in development mode with auto-reload"
}

# Consume the command-line options one at a time. --help/-h prints usage and
# exits 0; anything unrecognised prints usage and exits 1.
until [[ $# -eq 0 ]]; do
  if [[ "$1" == "--dev" ]]; then
    DEV_MODE=true
    shift
  elif [[ "$1" == "--help" || "$1" == "-h" ]]; then
    show_help
    exit 0
  else
    echo "Unknown option: $1"
    show_help
    exit 1
  fi
done
|
||||
|
||||
###############################################################################
|
||||
### CONFIGURATION
|
||||
###############################################################################
|
||||
|
|
@ -55,16 +19,10 @@ VENV_PATH="$BASE_DIR/venv"
|
|||
|
||||
ARQ_WORKERS=${ARQ_WORKERS:-1}
|
||||
LOG_TO_FILE=${LOG_TO_FILE:-true} # Set to false in Docker to use stdout
|
||||
WAIT_FOR_PROCESSES=${WAIT_FOR_PROCESSES:-false} # Set to true in Docker to keep container alive
|
||||
|
||||
# Log startup
|
||||
cd "$BASE_DIR"
|
||||
if $DEV_MODE; then
|
||||
echo "Starting Dograh Services (DEV MODE) at $(date) in BASE_DIR: ${BASE_DIR}"
|
||||
echo "Auto-reload enabled for api/ directory changes"
|
||||
else
|
||||
echo "Starting Dograh Services at $(date) in BASE_DIR: ${BASE_DIR}"
|
||||
fi
|
||||
echo "Starting Dograh Services at $(date) in BASE_DIR: ${BASE_DIR}"
|
||||
|
||||
###############################################################################
|
||||
### 1) Load environment variables
|
||||
|
|
@ -75,10 +33,34 @@ if [[ -f "$ENV_FILE" ]]; then
|
|||
set -a && . "$ENV_FILE" && set +a
|
||||
fi
|
||||
|
||||
FASTAPI_PORT=${FASTAPI_PORT:-8000}
|
||||
UVICORN_BASE_PORT=${UVICORN_BASE_PORT:-8000}
|
||||
CPU_CORES=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 1)
|
||||
FASTAPI_WORKERS=${FASTAPI_WORKERS:-$CPU_CORES}
|
||||
|
||||
###############################################################################
|
||||
### 1b) Safety check — refuse to start over running services
|
||||
###############################################################################
|
||||
|
||||
# Count the "*.pid" files in a directory that reference a live process.
# Arguments: $1 - directory holding the pid files
# Outputs:   the number of live pid files, on stdout
count_live_pidfiles() {
  local pid_dir=$1
  local pidfile pid
  local live=0

  for pidfile in "$pid_dir"/*.pid; do
    # An unmatched glob stays literal; skip it.
    [[ -e "$pidfile" ]] || continue
    # An unreadable pid file must not abort the whole script under `set -e`.
    pid=$(<"$pidfile") || continue
    # Only a positive integer names a single process. "0" or a negative value
    # would make `kill -0` probe a process group and report a false positive.
    [[ "$pid" =~ ^[1-9][0-9]*$ ]] || continue
    if kill -0 "$pid" 2>/dev/null; then
      live=$((live + 1))
    fi
  done

  echo "$live"
}

# Refuse to cold-start on top of services that are still running.
if [[ -d "$RUN_DIR" ]]; then
  live_count=$(count_live_pidfiles "$RUN_DIR")

  if [[ $live_count -gt 0 ]]; then
    echo "ERROR: $live_count service(s) are still running."
    echo ""
    echo " Stop first: ./scripts/stop_services.sh"
    echo " For a zero-downtime deploy, use: ./scripts/rolling_update.sh"
    echo ""
    exit 1
  fi
fi
|
||||
|
||||
###############################################################################
|
||||
### 2) Define services
|
||||
###############################################################################
|
||||
|
|
@ -88,24 +70,20 @@ FASTAPI_WORKERS=${FASTAPI_WORKERS:-$CPU_CORES}
|
|||
SERVICE_NAMES=(
|
||||
"ari_manager"
|
||||
"campaign_orchestrator"
|
||||
"uvicorn"
|
||||
)
|
||||
|
||||
# Build uvicorn command based on mode
|
||||
if $DEV_MODE; then
|
||||
# Dev mode: single worker with auto-reload (--reload is incompatible with --workers > 1)
|
||||
UVICORN_CMD="uvicorn api.app:app --host 0.0.0.0 --port $FASTAPI_PORT --reload --reload-dir api"
|
||||
else
|
||||
# Production mode: multiple workers, no reload
|
||||
UVICORN_CMD="uvicorn api.app:app --host 0.0.0.0 --port $FASTAPI_PORT --workers $FASTAPI_WORKERS"
|
||||
fi
|
||||
|
||||
SERVICE_COMMANDS=(
|
||||
"python -m api.services.telephony.ari_manager"
|
||||
"python -m api.services.campaign.campaign_orchestrator"
|
||||
"$UVICORN_CMD"
|
||||
)
|
||||
|
||||
# Register one single-process uvicorn instance per worker. Each instance
# listens on its own loopback port, counting up from UVICORN_BASE_PORT
# (the instances sit behind nginx least_conn).
w=0
while ((w < FASTAPI_WORKERS)); do
  port=$((UVICORN_BASE_PORT + w))
  SERVICE_NAMES+=("uvicorn_$port")
  SERVICE_COMMANDS+=("uvicorn api.app:app --host 127.0.0.1 --port $port")
  w=$((w + 1))
done
|
||||
|
||||
# Add ARQ workers dynamically
|
||||
for ((i=1; i<=ARQ_WORKERS; i++)); do
|
||||
SERVICE_NAMES+=("arq$i")
|
||||
|
|
@ -124,100 +102,19 @@ else
|
|||
echo "Continuing without virtual environment activation..."
|
||||
fi
|
||||
|
||||
###############################################################################
|
||||
### 4) Stop old services
|
||||
###############################################################################
|
||||
|
||||
mkdir -p "$RUN_DIR"
|
||||
|
||||
# Function to get all descendant PIDs of a process (children, grandchildren, etc.)
|
||||
get_descendants() {
|
||||
local parent_pid=$1
|
||||
local descendants=""
|
||||
local children
|
||||
|
||||
# Get direct children
|
||||
children=$(pgrep -P "$parent_pid" 2>/dev/null || true)
|
||||
|
||||
for child in $children; do
|
||||
# Recursively get descendants of each child
|
||||
descendants="$descendants $child $(get_descendants "$child")"
|
||||
done
|
||||
|
||||
echo "$descendants"
|
||||
}
|
||||
|
||||
# Function to kill a process and all its descendants
|
||||
kill_process_tree() {
|
||||
local pid=$1
|
||||
local signal=$2
|
||||
local descendants
|
||||
|
||||
descendants=$(get_descendants "$pid")
|
||||
|
||||
# Kill children first (bottom-up), then parent
|
||||
for desc_pid in $descendants; do
|
||||
if kill -0 "$desc_pid" 2>/dev/null; then
|
||||
kill "$signal" "$desc_pid" 2>/dev/null || true
|
||||
fi
|
||||
done
|
||||
|
||||
# Kill the parent
|
||||
if kill -0 "$pid" 2>/dev/null; then
|
||||
kill "$signal" "$pid" 2>/dev/null || true
|
||||
fi
|
||||
}
|
||||
|
||||
for name in "${SERVICE_NAMES[@]}"; do
|
||||
pidfile="$RUN_DIR/$name.pid"
|
||||
|
||||
if [[ -f $pidfile ]]; then
|
||||
oldpid=$(<"$pidfile")
|
||||
|
||||
if kill -0 "$oldpid" 2>/dev/null; then
|
||||
echo "Stopping $name (PID $oldpid and all descendants)…"
|
||||
|
||||
# Kill the entire process tree (parent + all descendants)
|
||||
kill_process_tree "$oldpid" "-TERM"
|
||||
sleep 4
|
||||
|
||||
# Check if parent or any descendants are still alive
|
||||
still_alive=false
|
||||
if kill -0 "$oldpid" 2>/dev/null; then
|
||||
still_alive=true
|
||||
else
|
||||
for desc_pid in $(get_descendants "$oldpid"); do
|
||||
if kill -0 "$desc_pid" 2>/dev/null; then
|
||||
still_alive=true
|
||||
break
|
||||
fi
|
||||
done
|
||||
fi
|
||||
|
||||
if $still_alive; then
|
||||
echo "⚠️ $name did not exit cleanly, forcing stop..."
|
||||
kill_process_tree "$oldpid" "-KILL"
|
||||
sleep 1
|
||||
fi
|
||||
fi
|
||||
|
||||
rm -f "$pidfile"
|
||||
else
|
||||
echo "No PID file for $name, skipping stop."
|
||||
fi
|
||||
done
|
||||
|
||||
# Clean up any port tracking files for uvicorn
|
||||
rm -f "$RUN_DIR/uvicorn.port" "$RUN_DIR/uvicorn_new.port" "$RUN_DIR/uvicorn_old.pid"
|
||||
NGINX_UPSTREAM_TEMPLATE="$BASE_DIR/nginx/dograh_upstream.conf.template"
|
||||
NGINX_UPSTREAM_CONF="/etc/nginx/conf.d/dograh_upstream.conf"
|
||||
|
||||
###############################################################################
|
||||
### 5) Run migrations
|
||||
### 4) Run migrations
|
||||
###############################################################################
|
||||
|
||||
alembic -c "$BASE_DIR/api/alembic.ini" upgrade head
|
||||
|
||||
###############################################################################
|
||||
### 6) Prepare logs
|
||||
### 7) Prepare logs
|
||||
###############################################################################
|
||||
|
||||
mkdir -p "$BASE_LOG_DIR" "$LOG_DIR"
|
||||
|
|
@ -232,7 +129,7 @@ echo "Log directory: $LOG_DIR"
|
|||
echo "Latest symlink: $LATEST_LINK -> $TIMESTAMP"
|
||||
|
||||
###############################################################################
|
||||
### 7) Start services
|
||||
### 8) Start services
|
||||
###############################################################################
|
||||
|
||||
for i in "${!SERVICE_NAMES[@]}"; do
|
||||
|
|
@ -255,22 +152,47 @@ for i in "${!SERVICE_NAMES[@]}"; do
|
|||
echo $pid >"$RUN_DIR/$name.pid"
|
||||
echo " Started with PID $pid"
|
||||
|
||||
if [[ "$name" == "uvicorn" ]]; then
|
||||
echo "$FASTAPI_PORT" >"$RUN_DIR/uvicorn.port"
|
||||
fi
|
||||
done
|
||||
|
||||
# Cold start always uses band A (for rolling_update.sh dual-band strategy)
|
||||
echo "A" > "$RUN_DIR/active_band"
|
||||
|
||||
###############################################################################
|
||||
### 8) Summary
|
||||
### 8b) Generate nginx upstream config & reload
|
||||
###############################################################################
|
||||
|
||||
# Print one nginx "server" directive per uvicorn worker.
# Arguments: $1 - port of the first worker
#            $2 - number of workers
# Outputs:   one directive per line, on stdout
render_upstream_servers() {
  local base_port=$1
  local worker_count=$2
  local idx

  for ((idx = 0; idx < worker_count; idx++)); do
    printf '    server 127.0.0.1:%d;\n' "$((base_port + idx))"
  done
}

if [[ -f "$NGINX_UPSTREAM_TEMPLATE" ]]; then
  # Build upstream server list from worker ports
  UPSTREAM_SERVERS=$(render_upstream_servers "$UVICORN_BASE_PORT" "$FASTAPI_WORKERS")

  # Generate upstream config from template.
  # The placeholder is substituted in the shell: passing the list through a
  # sed replacement relied on sed translating "\n" into a newline, which is
  # not portable across sed implementations.
  upstream_placeholder='{{UVICORN_UPSTREAM_SERVERS}}'
  upstream_template=$(<"$NGINX_UPSTREAM_TEMPLATE")
  upstream_conf=${upstream_template//"$upstream_placeholder"/"$UPSTREAM_SERVERS"}
  printf '%s\n' "$upstream_conf" | sudo tee "$NGINX_UPSTREAM_CONF" > /dev/null

  echo "Generated nginx upstream config with $FASTAPI_WORKERS workers (ports ${UVICORN_BASE_PORT}-$((UVICORN_BASE_PORT + FASTAPI_WORKERS - 1)))"

  # Test and reload nginx
  if sudo nginx -t 2>/dev/null; then
    sudo systemctl reload nginx
    echo "Nginx reloaded successfully"
  else
    echo "ERROR: nginx config test failed, not reloading"
    # Run the test again without silencing stderr so the reason is shown.
    sudo nginx -t
    exit 1
  fi
fi
|
||||
|
||||
###############################################################################
|
||||
### 9) Summary
|
||||
###############################################################################
|
||||
|
||||
echo
|
||||
echo "──────────────────────────────────────────────────"
|
||||
if $DEV_MODE; then
|
||||
echo "Mode: DEVELOPMENT (auto-reload enabled)"
|
||||
else
|
||||
echo "Mode: PRODUCTION"
|
||||
fi
|
||||
echo "Mode: PRODUCTION"
|
||||
echo ""
|
||||
for name in "${SERVICE_NAMES[@]}"; do
|
||||
pid=$(<"$RUN_DIR/$name.pid")
|
||||
|
|
@ -284,8 +206,3 @@ echo "Logs: tail -f $LOG_DIR/*.log"
|
|||
echo "Rotated logs: ls $LOG_DIR/*.log.*"
|
||||
echo "To stop: ./scripts/stop_services.sh"
|
||||
echo "──────────────────────────────────────────────────"
|
||||
|
||||
# In Docker mode, wait for all background processes to keep container alive
|
||||
if [[ "$WAIT_FOR_PROCESSES" == "true" ]]; then
|
||||
wait
|
||||
fi
|
||||
|
|
|
|||
219
scripts/start_services_dev.sh
Executable file
219
scripts/start_services_dev.sh
Executable file
|
|
@ -0,0 +1,219 @@
|
|||
#!/usr/bin/env bash
|
||||
set -e # Exit on error
|
||||
|
||||
###############################################################################
|
||||
### CONFIGURATION
|
||||
###############################################################################
|
||||
|
||||
# Determine BASE_DIR as the parent of the directory that contains this script.
# "<script dir>/.." is resolved instead of applying dirname twice: for an
# invocation such as "./start_services_dev.sh" from inside scripts/, the
# double dirname collapses to "." and would select scripts/ itself.
BASE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

ENV_FILE="$BASE_DIR/api/.env"
RUN_DIR="$BASE_DIR/run"               # Where we keep *.pid
BASE_LOG_DIR="$BASE_DIR/logs"         # Base logs directory

TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
LOG_DIR="$BASE_LOG_DIR/$TIMESTAMP"    # Timestamped log directory
LATEST_LINK="$BASE_LOG_DIR/latest"    # Symlink to latest logs
VENV_PATH="$BASE_DIR/venv"

# Both settings can be overridden from the environment.
ARQ_WORKERS=${ARQ_WORKERS:-1}
LOG_TO_FILE=${LOG_TO_FILE:-true}

cd "$BASE_DIR"
echo "Starting Dograh Services (DEV MODE) at $(date) in BASE_DIR: ${BASE_DIR}"
echo "Auto-reload enabled for api/ directory changes"
|
||||
|
||||
###############################################################################
|
||||
### 1) Load environment variables
|
||||
###############################################################################
|
||||
|
||||
# Export every variable assigned while sourcing the env file (set -a), so the
# services started later inherit them.
if [[ -f "$ENV_FILE" ]]; then
  set -a && . "$ENV_FILE" && set +a
fi

# Port for the dev uvicorn server. FASTAPI_PORT is accepted as a fallback so
# an existing setting keeps selecting the port; the default is 8000.
UVICORN_BASE_PORT=${UVICORN_BASE_PORT:-${FASTAPI_PORT:-8000}}
|
||||
|
||||
###############################################################################
|
||||
### 2) Define services
|
||||
###############################################################################
|
||||
|
||||
# Services are described by two parallel arrays: SERVICE_NAMES[i] is the label
# used for the pid and log files, SERVICE_COMMANDS[i] the command line to run.
SERVICE_NAMES=()
SERVICE_COMMANDS=()

SERVICE_NAMES+=("ari_manager")
SERVICE_COMMANDS+=("python -m api.services.telephony.ari_manager")

SERVICE_NAMES+=("campaign_orchestrator")
SERVICE_COMMANDS+=("python -m api.services.campaign.campaign_orchestrator")

# Single uvicorn process that reloads when files under api/ change.
SERVICE_NAMES+=("uvicorn")
SERVICE_COMMANDS+=("uvicorn api.app:app --host 0.0.0.0 --port $UVICORN_BASE_PORT --reload --reload-dir api")

# Append ARQ_WORKERS ARQ workers, named arq1, arq2, ...
i=1
while ((i <= ARQ_WORKERS)); do
  SERVICE_NAMES+=("arq$i")
  SERVICE_COMMANDS+=("python -m arq api.tasks.arq.WorkerSettings --custom-log-dict api.tasks.arq.LOG_CONFIG")
  i=$((i + 1))
done
|
||||
|
||||
###############################################################################
|
||||
### 3) Activate virtual environment
|
||||
###############################################################################
|
||||
|
||||
if [[ -d "$VENV_PATH" && -f "$VENV_PATH/bin/activate" ]]; then
|
||||
source "$VENV_PATH/bin/activate"
|
||||
echo "Virtual environment activated: $VENV_PATH"
|
||||
else
|
||||
echo "Warning: Virtual environment not found at $VENV_PATH"
|
||||
echo "Continuing without virtual environment activation..."
|
||||
fi
|
||||
|
||||
###############################################################################
|
||||
### 4) Stop old services
|
||||
###############################################################################
|
||||
|
||||
mkdir -p "$RUN_DIR"
|
||||
|
||||
# Print the PIDs of all descendants of a process (children, grandchildren, ...).
# Arguments: $1 - PID whose process tree is walked
# Outputs:   space-separated PIDs on stdout, each child listed before its own
#            descendants; an empty line when there are none
get_descendants() {
  local parent_pid=$1
  local descendants=""
  # The loop variable is local too, so it does not leak into the caller.
  local children child

  # Direct children only; pgrep exits non-zero when nothing matches, which
  # must not abort the script under `set -e`.
  children=$(pgrep -P "$parent_pid" 2>/dev/null || true)

  for child in $children; do
    # Recurse to pick up the subtree below each child.
    descendants="$descendants $child $(get_descendants "$child")"
  done

  echo "$descendants"
}
|
||||
|
||||
# Send a signal to a process and to every descendant it has at call time.
# Arguments: $1 - PID at the root of the tree
#            $2 - signal option handed to kill, e.g. "-TERM" or "-KILL"
# Delivery failures are ignored; a PID that is already gone is skipped.
kill_process_tree() {
  local pid=$1
  local signal=$2
  local descendants desc_pid

  # Collect the descendants before signalling anything.
  descendants=$(get_descendants "$pid")

  # Kill the parent first so supervisors don't respawn children
  # (same order as kill_process_tree in stop_services.sh).
  if kill -0 "$pid" 2>/dev/null; then
    kill "$signal" "$pid" 2>/dev/null || true
  fi

  # Then kill any remaining descendants
  for desc_pid in $descendants; do
    if kill -0 "$desc_pid" 2>/dev/null; then
      kill "$signal" "$desc_pid" 2>/dev/null || true
    fi
  done
}
|
||||
|
||||
# Stop any instance of each service left over from a previous run: send TERM to
# its process tree, wait, then KILL whatever is still alive.
for name in "${SERVICE_NAMES[@]}"; do
  pidfile="$RUN_DIR/$name.pid"

  if [[ ! -f "$pidfile" ]]; then
    echo "No PID file for $name, skipping stop."
    continue
  fi

  oldpid=$(<"$pidfile")

  if kill -0 "$oldpid" 2>/dev/null; then
    echo "Stopping $name (PID $oldpid and all descendants)…"

    # Snapshot the tree BEFORE signalling. Once the parent has exited its
    # children are reparented, so looking them up by parent PID afterwards
    # finds nothing and survivors would never be force-killed.
    tree_pids="$oldpid $(get_descendants "$oldpid")"

    kill_process_tree "$oldpid" "-TERM"
    sleep 4

    # Check every process of the snapshot, not just the parent.
    survivors=""
    for tree_pid in $tree_pids; do
      if kill -0 "$tree_pid" 2>/dev/null; then
        survivors="$survivors $tree_pid"
      fi
    done

    if [[ -n "$survivors" ]]; then
      echo "⚠️ $name did not exit cleanly, forcing stop..."
      # Covers children spawned after the snapshot while the parent is alive.
      kill_process_tree "$oldpid" "-KILL"
      # Covers snapshot members that outlived an already-dead parent.
      for tree_pid in $survivors; do
        kill -KILL "$tree_pid" 2>/dev/null || true
      done
      sleep 1
    fi
  fi

  # The pid file is removed whether or not the process was still running.
  rm -f "$pidfile"
done
|
||||
|
||||
# Clean up legacy port tracking files
|
||||
rm -f "$RUN_DIR/uvicorn.port" "$RUN_DIR/uvicorn_new.port" "$RUN_DIR/uvicorn_old.pid" "$RUN_DIR/active_band"
|
||||
|
||||
###############################################################################
|
||||
### 5) Run migrations
|
||||
###############################################################################
|
||||
|
||||
alembic -c "$BASE_DIR/api/alembic.ini" upgrade head
|
||||
|
||||
###############################################################################
|
||||
### 6) Prepare logs
|
||||
###############################################################################
|
||||
|
||||
# Create this run's timestamped log directory and repoint the "latest" symlink
# at it. The link target is relative (the timestamp only).
mkdir -p "$BASE_LOG_DIR" "$LOG_DIR"

# Only an existing symlink is removed; anything else is left for ln to report.
[[ ! -L "$LATEST_LINK" ]] || rm "$LATEST_LINK"
ln -s "$TIMESTAMP" "$LATEST_LINK"

printf '%s\n' \
  "Log directory: $LOG_DIR" \
  "Latest symlink: $LATEST_LINK -> $TIMESTAMP"
|
||||
|
||||
###############################################################################
|
||||
### 7) Start services
|
||||
###############################################################################
|
||||
|
||||
# Launch every service in the background and record its PID in
# $RUN_DIR/<name>.pid.
for i in "${!SERVICE_NAMES[@]}"; do
  name="${SERVICE_NAMES[$i]}"
  cmd="${SERVICE_COMMANDS[$i]}"
  echo "→ Starting $name"

  # The subshell execs the command, so the PID captured below is the PID of
  # the service process itself. $cmd is deliberately unquoted: it is a command
  # line that has to be split into words.
  (
    cd "$BASE_DIR"
    if [[ "$LOG_TO_FILE" != "true" ]]; then
      # Logging to the inherited stdout/stderr.
      exec $cmd
    fi
    export LOG_FILE_PATH="$LOG_DIR/$name.log"
    exec $cmd >>"$LOG_DIR/$name.log" 2>&1
  ) &

  pid=$!
  printf '%s\n' "$pid" >"$RUN_DIR/$name.pid"
  echo " Started with PID $pid"

done
|
||||
|
||||
###############################################################################
|
||||
### 8) Summary
|
||||
###############################################################################
|
||||
|
||||
# Print a summary of what was started and where the logs are.
echo
echo "──────────────────────────────────────────────────"
echo "Mode: DEVELOPMENT (auto-reload enabled)"
echo ""
for name in "${SERVICE_NAMES[@]}"; do
  pid=$(<"$RUN_DIR/$name.pid")
  echo "✓ $name (PID $pid) → $LOG_DIR/$name.log"
done
echo ""
echo " Rotation: ${LOG_ROTATION_SIZE:-100 MB}"
echo " Retention: ${LOG_RETENTION:-7 days}"
echo " Compression: ${LOG_COMPRESSION:-gz}"
echo "Logs: tail -f $LOG_DIR/*.log"
echo "Rotated logs: ls $LOG_DIR/*.log.*"
echo "To stop: ./scripts/stop_services.sh"
echo "──────────────────────────────────────────────────"

# Every service runs in the background, so this script returns as soon as the
# summary is printed. When the script is the main process of a container (or
# any supervisor that treats its exit as "stopped"), set
# WAIT_FOR_PROCESSES=true to stay in the foreground until all services exit.
if [[ "${WAIT_FOR_PROCESSES:-false}" == "true" ]]; then
  wait
fi
|
||||
|
|
@ -42,17 +42,17 @@ kill_process_tree() {
|
|||
|
||||
descendants=$(get_descendants "$pid")
|
||||
|
||||
# Kill children first (bottom-up), then parent
|
||||
# Kill the parent first so supervisors don't respawn children
|
||||
if kill -0 "$pid" 2>/dev/null; then
|
||||
kill "$signal" "$pid" 2>/dev/null || true
|
||||
fi
|
||||
|
||||
# Then kill any remaining descendants
|
||||
for desc_pid in $descendants; do
|
||||
if kill -0 "$desc_pid" 2>/dev/null; then
|
||||
kill "$signal" "$desc_pid" 2>/dev/null || true
|
||||
fi
|
||||
done
|
||||
|
||||
# Kill the parent
|
||||
if kill -0 "$pid" 2>/dev/null; then
|
||||
kill "$signal" "$pid" 2>/dev/null || true
|
||||
fi
|
||||
}
|
||||
|
||||
###############################################################################
|
||||
|
|
@ -113,14 +113,14 @@ for pidfile in "${pid_files[@]}"; do
|
|||
# Final check
|
||||
if kill -0 "$oldpid" 2>/dev/null; then
|
||||
echo " Error: Failed to stop $name (PID $oldpid)"
|
||||
((failed_count++))
|
||||
failed_count=$((failed_count + 1))
|
||||
else
|
||||
echo " Stopped $name (forced)"
|
||||
((stopped_count++))
|
||||
stopped_count=$((stopped_count + 1))
|
||||
fi
|
||||
else
|
||||
echo " Stopped $name"
|
||||
((stopped_count++))
|
||||
stopped_count=$((stopped_count + 1))
|
||||
fi
|
||||
else
|
||||
echo "Service $name (PID $oldpid) is not running"
|
||||
|
|
@ -130,8 +130,8 @@ for pidfile in "${pid_files[@]}"; do
|
|||
fi
|
||||
done
|
||||
|
||||
# Clean up any port tracking files for uvicorn
|
||||
rm -f "$RUN_DIR/uvicorn.port" "$RUN_DIR/uvicorn_new.port" "$RUN_DIR/uvicorn_old.pid"
|
||||
# Clean up any port tracking files for uvicorn and band tracking
|
||||
rm -f "$RUN_DIR/uvicorn.port" "$RUN_DIR/uvicorn_new.port" "$RUN_DIR/uvicorn_old.pid" "$RUN_DIR/active_band"
|
||||
|
||||
###############################################################################
|
||||
### SUMMARY
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue