diff --git a/.gitignore b/.gitignore index e816974..4a8410f 100644 --- a/.gitignore +++ b/.gitignore @@ -8,10 +8,10 @@ __pycache__ /logs/ /run/ infrastructure/ -nginx/ prd/ .vercel venv/ .venv/ .playwright-mcp +coturn/ \ No newline at end of file diff --git a/AGENTS.md b/AGENTS.md index e18201e..ac941b6 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -29,7 +29,7 @@ dograh/ ```bash # Start infrastructure services (postgres, redis, minio) -./scripts/start_services.sh --dev +./scripts/start_services_dev.sh # Stop all services ./scripts/stop_services.sh diff --git a/api/Dockerfile b/api/Dockerfile index 91078f9..834462a 100644 --- a/api/Dockerfile +++ b/api/Dockerfile @@ -59,17 +59,15 @@ ENV PYTHONUNBUFFERED=1 # Copy application code COPY ./api ./api -COPY ./scripts/start_services.sh ./scripts/start_services.sh +COPY ./scripts/start_services_dev.sh ./scripts/start_services_dev.sh ENV PYTHONPATH=/app # Disable file logging in Docker - logs go to stdout for docker logs ENV LOG_TO_FILE=false -# Keep container alive by waiting for background processes -ENV WAIT_FOR_PROCESSES=true # Expose the port FastAPI will run on EXPOSE 8000 # Run the FastAPI app with uvicorn -CMD ["./scripts/start_services.sh"] \ No newline at end of file +CMD ["./scripts/start_services_dev.sh"] \ No newline at end of file diff --git a/docs/contribution/setup.mdx b/docs/contribution/setup.mdx index b638ae4..dd30c3b 100644 --- a/docs/contribution/setup.mdx +++ b/docs/contribution/setup.mdx @@ -59,7 +59,7 @@ bash scripts/setup_pipecat.sh ``` 10. 
Start backend services ``` -bash scripts/start_services.sh --dev +bash scripts/start_services_dev.sh ``` Verify that your backend server is running ``` diff --git a/nginx/dograh_upstream.conf.template b/nginx/dograh_upstream.conf.template new file mode 100644 index 0000000..8177be9 --- /dev/null +++ b/nginx/dograh_upstream.conf.template @@ -0,0 +1,7 @@ +# AUTO-GENERATED by start_services.sh / rolling_update.sh +# Do not edit the generated file directly - edit this template instead. +upstream dograh_backend { + least_conn; + +{{UVICORN_UPSTREAM_SERVERS}} +} diff --git a/scripts/rolling_update.sh b/scripts/rolling_update.sh new file mode 100755 index 0000000..b98bb77 --- /dev/null +++ b/scripts/rolling_update.sh @@ -0,0 +1,471 @@ +#!/usr/bin/env bash +# rolling_update.sh — Zero-downtime rolling update using dual-band port strategy +# +# Usage: +# ./scripts/rolling_update.sh +# DRAIN_TIMEOUT=600 ./scripts/rolling_update.sh +# +# Old workers drain active calls (WebSocket/WebRTC) before shutting down. +# Nginx switches to new workers only after every one passes health checks. +# A failure in phases 2-4 rolls back: it kills the new workers and leaves the old +# workers running. Migrations and phases 5-7 (after the reload) are not rolled back. + +set -euo pipefail + +############################################################################### +### CONFIGURATION +############################################################################### + +BASE_DIR="$(cd "$(dirname "$(dirname "${BASH_SOURCE[0]}")")" && pwd)" + +ENV_FILE="$BASE_DIR/api/.env" +RUN_DIR="$BASE_DIR/run" +BASE_LOG_DIR="$BASE_DIR/logs" +LATEST_LINK="$BASE_LOG_DIR/latest" +VENV_PATH="$BASE_DIR/venv" + +NGINX_UPSTREAM_TEMPLATE="$BASE_DIR/nginx/dograh_upstream.conf.template" +NGINX_UPSTREAM_CONF="/etc/nginx/conf.d/dograh_upstream.conf" + +HEALTH_CHECK_ENDPOINT="/api/v1/health" + +# Load environment +if [[ -f "$ENV_FILE" ]]; then + set -a && . 
"$ENV_FILE" && set +a +fi + +UVICORN_BASE_PORT=${UVICORN_BASE_PORT:-8000} +CPU_CORES=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 1) +FASTAPI_WORKERS=${FASTAPI_WORKERS:-$CPU_CORES} +ARQ_WORKERS=${ARQ_WORKERS:-1} + +# Tuning knobs (override via environment) +DRAIN_TIMEOUT=${DRAIN_TIMEOUT:-300} # seconds to wait for old workers to drain +HEALTH_MAX_ATTEMPTS=${HEALTH_MAX_ATTEMPTS:-30} # per-worker health-check retries +HEALTH_INTERVAL=${HEALTH_INTERVAL:-2} # seconds between health-check retries + +cd "$BASE_DIR" + +############################################################################### +### HELPERS +############################################################################### + +log_info() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO: $*"; } +log_warn() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] WARN: $*"; } +log_error() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] ERROR: $*" >&2; } + +# Band port calculation: band A = base, band B = base + 100 +band_base_port() { + local band=$1 + if [[ "$band" == "A" ]]; then + echo "$UVICORN_BASE_PORT" + else + echo $((UVICORN_BASE_PORT + 100)) + fi +} + +opposite_band() { + if [[ "$1" == "A" ]]; then echo "B"; else echo "A"; fi +} + +# Get all descendant PIDs of a process +get_descendants() { + local parent_pid=$1 + local descendants="" + local children + children=$(pgrep -P "$parent_pid" 2>/dev/null || true) + for child in $children; do + descendants="$descendants $child $(get_descendants "$child")" + done + echo "$descendants" +} + +# Kill a process and all its descendants +kill_process_tree() { + local pid=$1 + local signal=$2 + local descendants + descendants=$(get_descendants "$pid") + for desc_pid in $descendants; do + if kill -0 "$desc_pid" 2>/dev/null; then + kill "$signal" "$desc_pid" 2>/dev/null || true + fi + done + if kill -0 "$pid" 2>/dev/null; then + kill "$signal" "$pid" 2>/dev/null || true + fi +} + +############################################################################### +### ROLLBACK 
+############################################################################### + +# Kill all new-band workers and leave old workers + nginx untouched +rollback_new_workers() { + local new_band=$1 + local new_base + new_base=$(band_base_port "$new_band") + + log_error "ROLLING BACK — killing new band $new_band workers" + + for ((w = 0; w < FASTAPI_WORKERS; w++)); do + local port=$((new_base + w)) + local pidfile="$RUN_DIR/uvicorn_${port}.pid" + if [[ -f "$pidfile" ]]; then + local pid + pid=$(<"$pidfile") + if kill -0 "$pid" 2>/dev/null; then + kill_process_tree "$pid" "-KILL" + log_info " Killed uvicorn_${port} (PID $pid)" + fi + rm -f "$pidfile" + fi + done + + log_error "Rollback complete. Old workers and nginx are untouched." +} + +############################################################################### +### PHASE 0: PRE-FLIGHT CHECKS +############################################################################### + +log_info "=== Phase 0: Pre-flight checks ===" + +# Determine current and new band +if [[ -f "$RUN_DIR/active_band" ]]; then + OLD_BAND=$(<"$RUN_DIR/active_band") +else + log_error "No active_band file found in $RUN_DIR. Run start_services.sh first." + exit 1 +fi + +NEW_BAND=$(opposite_band "$OLD_BAND") +OLD_BASE=$(band_base_port "$OLD_BAND") +NEW_BASE=$(band_base_port "$NEW_BAND") + +log_info "Current band: $OLD_BAND (ports ${OLD_BASE}–$((OLD_BASE + FASTAPI_WORKERS - 1)))" +log_info "New band: $NEW_BAND (ports ${NEW_BASE}–$((NEW_BASE + FASTAPI_WORKERS - 1)))" + +# Verify at least one old worker is running +old_running=0 +for ((w = 0; w < FASTAPI_WORKERS; w++)); do + port=$((OLD_BASE + w)) + pidfile="$RUN_DIR/uvicorn_${port}.pid" + if [[ -f "$pidfile" ]]; then + pid=$(<"$pidfile") + if kill -0 "$pid" 2>/dev/null; then + old_running=$((old_running + 1)) + fi + fi +done + +if [[ $old_running -eq 0 ]]; then + log_error "No old workers are running. Use start_services.sh for a cold start." 
+ exit 1 +fi +log_info "Found $old_running running old worker(s)" + +# Verify new ports are free +for ((w = 0; w < FASTAPI_WORKERS; w++)); do + port=$((NEW_BASE + w)) + if ss -tln "sport = :$port" | grep -q LISTEN; then + log_error "Port $port is already in use. Cannot start new band." + exit 1 + fi +done +log_info "All new-band ports are free" + +# Verify nginx is running +if ! pgrep -x nginx >/dev/null 2>&1; then + log_error "nginx is not running." + exit 1 +fi +log_info "nginx is running" + +############################################################################### +### PHASE 1: RUN MIGRATIONS +############################################################################### + +log_info "=== Phase 1: Running Alembic migrations ===" + +# Activate virtual environment +if [[ -d "$VENV_PATH" && -f "$VENV_PATH/bin/activate" ]]; then + source "$VENV_PATH/bin/activate" +else + log_warn "No virtual environment at $VENV_PATH, continuing without" +fi + +if ! alembic -c "$BASE_DIR/api/alembic.ini" upgrade head; then + log_error "Alembic migration failed. Aborting — nothing has been touched." 
+ exit 1 +fi +log_info "Migrations complete" + +############################################################################### +### PHASE 2: START NEW WORKERS +############################################################################### + +log_info "=== Phase 2: Starting new workers on band $NEW_BAND ===" + +# Resolve log directory +if [[ -L "$LATEST_LINK" && -d "$LATEST_LINK" ]]; then + LOG_DIR="$BASE_LOG_DIR/$(readlink "$LATEST_LINK")" +else + # Create a new timestamped log dir for this deploy + TIMESTAMP=$(date +"%Y%m%d_%H%M%S") + LOG_DIR="$BASE_LOG_DIR/$TIMESTAMP" + mkdir -p "$LOG_DIR" + rm -f "$LATEST_LINK" + ln -s "$TIMESTAMP" "$LATEST_LINK" +fi + +mkdir -p "$RUN_DIR" + +for ((w = 0; w < FASTAPI_WORKERS; w++)); do + port=$((NEW_BASE + w)) + name="uvicorn_${port}" + log_info " Starting $name on port $port" + + ( + cd "$BASE_DIR" + export LOG_FILE_PATH="$LOG_DIR/${name}.log" + exec uvicorn api.app:app --host 127.0.0.1 --port "$port" \ + >>"$LOG_DIR/${name}.log" 2>&1 + ) & + + pid=$! + echo "$pid" > "$RUN_DIR/${name}.pid" + log_info " PID $pid" +done + +# Brief pause to let workers bind +sleep 3 + +# Quick sanity: make sure they haven't crashed immediately +for ((w = 0; w < FASTAPI_WORKERS; w++)); do + port=$((NEW_BASE + w)) + pid=$(<"$RUN_DIR/uvicorn_${port}.pid") + if ! 
kill -0 "$pid" 2>/dev/null; then + log_error "Worker uvicorn_${port} (PID $pid) died immediately" + rollback_new_workers "$NEW_BAND" + exit 1 + fi +done + +log_info "All $FASTAPI_WORKERS new workers started" + +############################################################################### +### PHASE 3: HEALTH-CHECK EVERY NEW WORKER +############################################################################### + +log_info "=== Phase 3: Health-checking new workers ===" + +# Probe each new worker up to HEALTH_MAX_ATTEMPTS times, HEALTH_INTERVAL seconds +# apart; the first worker that never returns HTTP 200 fails the whole deploy. +all_healthy=true +for ((w = 0; w < FASTAPI_WORKERS; w++)); do + port=$((NEW_BASE + w)) + healthy=false + + for ((attempt = 1; attempt <= HEALTH_MAX_ATTEMPTS; attempt++)); do + # Bound each probe so a worker that accepts the connection but never + # answers cannot stall the loop; HEALTH_TIMEOUT (seconds) defaults to 5. + http_code=$(curl -s --max-time "${HEALTH_TIMEOUT:-5}" -o /dev/null -w "%{http_code}" \ + "http://127.0.0.1:${port}${HEALTH_CHECK_ENDPOINT}" 2>/dev/null || echo "000") + + if [[ "$http_code" == "200" ]]; then + log_info " uvicorn_${port} healthy (attempt $attempt)" + healthy=true + break + fi + sleep "$HEALTH_INTERVAL" + done + + if ! $healthy; then + log_error " uvicorn_${port} FAILED health check after $HEALTH_MAX_ATTEMPTS attempts" + all_healthy=false + break + fi +done + +if ! $all_healthy; then + rollback_new_workers "$NEW_BAND" + exit 1 +fi + +log_info "All new workers are healthy" + +############################################################################### +### PHASE 4: SWITCH NGINX TO NEW BAND +############################################################################### + +log_info "=== Phase 4: Switching nginx to band $NEW_BAND ===" + +if [[ ! 
-f "$NGINX_UPSTREAM_TEMPLATE" ]]; then + log_error "Nginx upstream template not found at $NGINX_UPSTREAM_TEMPLATE" + rollback_new_workers "$NEW_BAND" + exit 1 +fi + +# Render the nginx upstream config for one band and write it to +# $NGINX_UPSTREAM_CONF. +# Arguments: $1 - first port of the band (one server line per FASTAPI_WORKERS) +# Returns: non-zero if sed or the sudo tee write fails (pipefail is set) +render_upstream_conf() { + local base_port=$1 + local servers="" + local w + for ((w = 0; w < FASTAPI_WORKERS; w++)); do + servers="${servers} server 127.0.0.1:$((base_port + w));\n" + done + sed -e "s|{{UVICORN_UPSTREAM_SERVERS}}|${servers}|" \ + "$NGINX_UPSTREAM_TEMPLATE" | sudo tee "$NGINX_UPSTREAM_CONF" > /dev/null +} + +# Put the old band's upstream config back, then kill the new workers. +# A failed restore is only warned about so the worker rollback still runs. +abort_switch() { + render_upstream_conf "$OLD_BASE" || log_warn "Could not restore old upstream config" + rollback_new_workers "$NEW_BAND" +} + +# Generate upstream config from new-band ports. Checked explicitly: under +# set -e an unchecked failure would exit with the new workers left running. +if ! render_upstream_conf "$NEW_BASE"; then + log_error "Failed to write $NGINX_UPSTREAM_CONF" + abort_switch + exit 1 +fi + +log_info "Generated nginx upstream config with $FASTAPI_WORKERS workers (ports ${NEW_BASE}–$((NEW_BASE + FASTAPI_WORKERS - 1)))" + +# Validate config +if ! sudo nginx -t 2>/dev/null; then + log_error "nginx config validation failed!" + sudo nginx -t 2>&1 || true + abort_switch + exit 1 +fi + +# Reload nginx (graceful — finishes in-flight requests to old upstream). +# If the reload command fails, treat the switch as not done and roll back. +if ! sudo systemctl reload nginx; then + log_error "nginx reload failed!" + abort_switch + exit 1 +fi +log_info "nginx reloaded — traffic now routed to band $NEW_BAND" + +# Record the switch now; Phase 7 writes the same value again. If a later phase +# aborts, the next run still sees the band nginx was switched to. +echo "$NEW_BAND" > "$RUN_DIR/active_band" + +############################################################################### +### PHASE 5: DRAIN OLD WORKERS +############################################################################### + +log_info "=== Phase 5: Draining old workers (band $OLD_BAND, timeout ${DRAIN_TIMEOUT}s) ===" + +# Collect old worker PIDs +OLD_PIDS=() +for ((w = 0; w < FASTAPI_WORKERS; w++)); do + port=$((OLD_BASE + w)) + pidfile="$RUN_DIR/uvicorn_${port}.pid" + if [[ -f "$pidfile" ]]; then + pid=$(<"$pidfile") + if kill -0 "$pid" 2>/dev/null; then + OLD_PIDS+=("$pid") + log_info " Sending SIGTERM to uvicorn_${port} (PID $pid)" + kill_process_tree 
"$pid" "-TERM" + fi + rm -f "$pidfile" + fi +done + +if [[ ${#OLD_PIDS[@]} -gt 0 ]]; then + start_time=$(date +%s) + + while true; do + all_dead=true + for pid in "${OLD_PIDS[@]}"; do + if kill -0 "$pid" 2>/dev/null; then + all_dead=false + break + fi + done + + if $all_dead; then + log_info "All old workers exited gracefully" + break + fi + + elapsed=$(( $(date +%s) - start_time )) + if [[ $elapsed -ge $DRAIN_TIMEOUT ]]; then + log_warn "Drain timeout reached (${DRAIN_TIMEOUT}s). Force-killing remaining old workers." + for pid in "${OLD_PIDS[@]}"; do + if kill -0 "$pid" 2>/dev/null; then + kill_process_tree "$pid" "-KILL" + log_warn " Force-killed PID $pid" + fi + done + sleep 1 + break + fi + + log_info " Waiting for old workers to drain... (${elapsed}s / ${DRAIN_TIMEOUT}s)" + sleep 5 + done +else + log_warn "No old worker PIDs to drain" +fi + +############################################################################### +### PHASE 6: RESTART NON-HTTP SERVICES +############################################################################### + +log_info "=== Phase 6: Restarting non-HTTP services ===" + +# Services to restart (same as start_services.sh) +RESTART_NAMES=( + "ari_manager" + "campaign_orchestrator" +) +RESTART_COMMANDS=( + "python -m api.services.telephony.ari_manager" + "python -m api.services.campaign.campaign_orchestrator" +) + +# Add ARQ workers +for ((i = 1; i <= ARQ_WORKERS; i++)); do + RESTART_NAMES+=("arq$i") + RESTART_COMMANDS+=("python -m arq api.tasks.arq.WorkerSettings --custom-log-dict api.tasks.arq.LOG_CONFIG") +done + +for i in "${!RESTART_NAMES[@]}"; do + name="${RESTART_NAMES[$i]}" + cmd="${RESTART_COMMANDS[$i]}" + pidfile="$RUN_DIR/${name}.pid" + + # Stop old instance + if [[ -f "$pidfile" ]]; then + oldpid=$(<"$pidfile") + if kill -0 "$oldpid" 2>/dev/null; then + log_info " Stopping $name (PID $oldpid)" + kill_process_tree "$oldpid" "-TERM" + sleep 2 + if kill -0 "$oldpid" 2>/dev/null; then + kill_process_tree "$oldpid" "-KILL" + 
sleep 1 + fi + fi + rm -f "$pidfile" + fi + + # Start new instance + log_info " Starting $name" + ( + cd "$BASE_DIR" + export LOG_FILE_PATH="$LOG_DIR/${name}.log" + exec $cmd >>"$LOG_DIR/${name}.log" 2>&1 + ) & + + pid=$! + echo "$pid" > "$RUN_DIR/${name}.pid" + log_info " PID $pid" +done + +############################################################################### +### PHASE 7: FINALIZE +############################################################################### + +log_info "=== Phase 7: Finalize ===" + +echo "$NEW_BAND" > "$RUN_DIR/active_band" +log_info "active_band set to $NEW_BAND" + +echo +echo "══════════════════════════════════════════════════" +echo " Rolling update completed successfully" +echo "" +echo " Band: $OLD_BAND → $NEW_BAND" +echo " Workers: $FASTAPI_WORKERS (ports ${NEW_BASE}–$((NEW_BASE + FASTAPI_WORKERS - 1)))" +echo " Services: ${RESTART_NAMES[*]}" +echo " Logs: $LOG_DIR" +echo "══════════════════════════════════════════════════" diff --git a/scripts/rolling_update_uvicorn.sh b/scripts/rolling_update_uvicorn.sh deleted file mode 100755 index 357a905..0000000 --- a/scripts/rolling_update_uvicorn.sh +++ /dev/null @@ -1,324 +0,0 @@ -#!/usr/bin/env bash -# rolling_update_uvicorn.sh — Zero-downtime rolling update for uvicorn workers - -set -e # Exit on error - -### CONFIGURATION ############################################################# - -# Determine BASE_DIR as parent of the scripts directory -BASE_DIR="$(cd "$(dirname "$(dirname "${BASH_SOURCE[0]}")")" && pwd)" - -ENV_FILE="$BASE_DIR/api/.env" -RUN_DIR="$BASE_DIR/run" -BASE_LOG_DIR="$BASE_DIR/logs" # Base logs directory (same as start_services.sh) -LATEST_LINK="$BASE_LOG_DIR/latest" # Symlink to latest logs (same as start_services.sh) -VENV_PATH="$BASE_DIR/venv" -HEALTH_CHECK_ENDPOINT="/api/v1/health" # Adjust as needed -MAX_WAIT_SECONDS=310 # Max wait for graceful shutdown (5 minutes + 10 seconds grace) - -# Load environment -set -a && . 
"$ENV_FILE" && set +a - -cd "$BASE_DIR" - -### FUNCTIONS ################################################################## - -log_info() { - echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO: $*" -} - -log_error() { - echo "[$(date '+%Y-%m-%d %H:%M:%S')] ERROR: $*" >&2 -} - -log_warning() { - echo "[$(date '+%Y-%m-%d %H:%M:%S')] WARN: $*" -} - -check_port_availability() { - local port=$1 - if lsof -Pi :$port -sTCP:LISTEN -t >/dev/null 2>&1; then - return 1 # Port is in use - fi - return 0 # Port is available -} - -wait_for_health_check() { - local port=$1 - local max_attempts=30 - local attempt=0 - - log_info "Waiting for new uvicorn workers to be healthy on port $port..." - - while [ $attempt -lt $max_attempts ]; do - if curl -s -o /dev/null -w "%{http_code}" "http://127.0.0.1:${port}${HEALTH_CHECK_ENDPOINT}" | grep -q "200"; then - log_info "Health check passed on port $port" - return 0 - fi - attempt=$((attempt + 1)) - log_info "Health check attempt $attempt/$max_attempts..." - sleep 1 - done - - log_error "Health check failed after $max_attempts attempts" - return 1 -} - -get_old_uvicorn_pids() { - local pidfile="$RUN_DIR/uvicorn.pid" - local pids="" - - if [[ -f "$pidfile" ]]; then - # Read the main PID - local main_pid=$(<"$pidfile") - if kill -0 "$main_pid" 2>/dev/null; then - # Get all PIDs in the process group - pids=$(ps -o pid= -g $(ps -o pgid= -p "$main_pid" | tr -d ' ') 2>/dev/null || echo "$main_pid") - fi - fi - - echo "$pids" -} - -graceful_shutdown_old_workers() { - local old_pids="$1" - - if [[ -z "$old_pids" ]]; then - log_warning "No old uvicorn workers found to shut down" - return 0 - fi - - log_info "Starting graceful shutdown of old uvicorn workers (PIDs: $(echo $old_pids | tr '\n' ' '))" - - # Send SIGTERM to trigger graceful shutdown - for pid in $old_pids; do - if kill -0 "$pid" 2>/dev/null; then - log_info "Sending SIGTERM to PID $pid" - kill -TERM "$pid" 2>/dev/null || true - fi - done - - # Wait for processes to exit gracefully - local 
start_time=$(date +%s) - local all_dead=false - - while [[ $(($(date +%s) - start_time)) -lt $MAX_WAIT_SECONDS ]]; do - all_dead=true - for pid in $old_pids; do - if kill -0 "$pid" 2>/dev/null; then - all_dead=false - break - fi - done - - if $all_dead; then - log_info "All old workers shut down gracefully" - return 0 - fi - - log_info "Waiting for workers to complete active requests... ($(( $(date +%s) - start_time ))s elapsed)" - sleep 5 - done - - # Force kill if still running after timeout - log_warning "Timeout reached, force killing remaining workers" - for pid in $old_pids; do - if kill -0 "$pid" 2>/dev/null; then - log_warning "Force killing PID $pid" - kill -KILL "$pid" 2>/dev/null || true - fi - done - - sleep 1 - return 0 -} - -start_new_uvicorn_workers() { - local new_port=$1 - - log_info "Starting new uvicorn workers on port $new_port..." - - # Get configuration from environment - set -a && . "$ENV_FILE" && set +a - - if [[ -z "${FASTAPI_WORKERS:-}" ]]; then - log_error "FASTAPI_WORKERS environment variable is not set" - return 1 - fi - - # Activate virtual environment - source ${VENV_PATH}/bin/activate - - # Use the latest log directory created by start_services.sh - local log_dir="" - - # First, check if the symlink exists and points to a valid directory - if [[ -L "$LATEST_LINK" ]] && [[ -d "$LATEST_LINK" ]]; then - # Follow the symlink to get the actual directory - log_dir="$BASE_LOG_DIR/$(readlink "$LATEST_LINK")" - log_info "Using existing log directory: $log_dir" - else - log_error "No log directory found. Run start_services.sh first to create logs directory." 
- log_error "Expected symlink at: $LATEST_LINK" - return 1 - fi - - # Create unique log filename using timestamp and script PID to avoid conflicts - local script_pid=$$ # PID of this rolling_update script (for uniqueness) - local timestamp=$(date '+%H%M%S') - export LOG_FILE_PATH="$log_dir/uvicorn-rollover-${timestamp}-${script_pid}.log" - - log_info "Starting uvicorn with $FASTAPI_WORKERS workers on port $new_port" - log_info "Logs: $LOG_FILE_PATH" - - # Start in background (same pattern as start_services.sh) - ( - cd "$BASE_DIR" - export LOG_FILE_PATH="$log_dir/uvicorn-rollover-${timestamp}-${script_pid}.log" - exec uvicorn api.app:app --host 0.0.0.0 --port $new_port --workers $FASTAPI_WORKERS >>"$LOG_FILE_PATH" 2>&1 - ) & - - local new_pid=$! - echo "$new_pid" > "$RUN_DIR/uvicorn_new.pid" - - # Save port information - echo "$new_port" > "$RUN_DIR/uvicorn_new.port" - - log_info "New uvicorn started with PID $new_pid" - - # Wait a bit for startup - sleep 5 - - # Check if process is still running - if ! kill -0 "$new_pid" 2>/dev/null; then - log_error "New uvicorn process died immediately" - return 1 - fi - - return 0 -} - -finalize_rollover() { - log_info "Finalizing rollover..." - - # Move new PID file to main PID file - if [[ -f "$RUN_DIR/uvicorn_new.pid" ]]; then - mv "$RUN_DIR/uvicorn_new.pid" "$RUN_DIR/uvicorn.pid" - fi - - # Store the new port for reference - if [[ -f "$RUN_DIR/uvicorn_new.port" ]]; then - mv "$RUN_DIR/uvicorn_new.port" "$RUN_DIR/uvicorn.port" - fi - - # Clean up old PID file if it exists - rm -f "$RUN_DIR/uvicorn_old.pid" - - log_info "Rollover completed successfully" -} - -rollback() { - local old_port=$1 - local new_pid=$2 - - log_error "Rolling back due to failure..." 
- - # Kill new workers if they exist - if [[ -n "$new_pid" ]] && kill -0 "$new_pid" 2>/dev/null; then - log_info "Killing new uvicorn workers (PID: $new_pid)" - kill -KILL -"$new_pid" 2>/dev/null || kill -KILL "$new_pid" 2>/dev/null || true - fi - - # Clean up temporary files - rm -f "$RUN_DIR/uvicorn_new.pid" "$RUN_DIR/uvicorn_new.port" - - log_error "Rollback completed" -} - -### MAIN LOGIC ################################################################ - -# Check arguments -if [[ $# -ne 1 ]]; then - echo "Usage: $0 " - echo "Example: $0 8001" - exit 1 -fi - -NEW_PORT=$1 - -# Validate port number -if ! [[ "$NEW_PORT" =~ ^[0-9]+$ ]] || [ "$NEW_PORT" -lt 1 ] || [ "$NEW_PORT" -gt 65535 ]; then - log_error "Invalid port number: $NEW_PORT" - exit 1 -fi - -# Check if port is available -if ! check_port_availability "$NEW_PORT"; then - log_error "Port $NEW_PORT is already in use" - exit 1 -fi - -# Get old port from file or environment -OLD_PORT="" -if [[ -f "$RUN_DIR/uvicorn.port" ]]; then - OLD_PORT=$(<"$RUN_DIR/uvicorn.port") -elif [[ -f "$ENV_FILE" ]]; then - set -a && . "$ENV_FILE" && set +a - OLD_PORT="${FASTAPI_PORT:-}" -fi - -if [[ "$NEW_PORT" == "$OLD_PORT" ]]; then - log_error "New port is the same as old port ($NEW_PORT)" - exit 1 -fi - -log_info "Starting rolling update from port ${OLD_PORT:-unknown} to port $NEW_PORT" - -# Create run directory if it doesn't exist -mkdir -p "$RUN_DIR" - -# Get old uvicorn PIDs before starting new ones -OLD_PIDS=$(get_old_uvicorn_pids) -if [[ -n "$OLD_PIDS" ]]; then - # Save old PIDs for potential rollback - echo "$OLD_PIDS" > "$RUN_DIR/uvicorn_old.pid" - log_info "Found old uvicorn workers: $(echo $OLD_PIDS | tr '\n' ' ')" -else - log_warning "No existing uvicorn workers found" -fi - -# Start new uvicorn workers -if ! start_new_uvicorn_workers "$NEW_PORT"; then - log_error "Failed to start new uvicorn workers" - exit 1 -fi - -NEW_PID=$(<"$RUN_DIR/uvicorn_new.pid") - -# Wait for new workers to be healthy -if ! 
wait_for_health_check "$NEW_PORT"; then - log_error "New workers failed health check" - rollback "$OLD_PORT" "$NEW_PID" - exit 1 -fi - -# Give the system some time to stabilize before shutting down old workers -log_info "Waiting for system to stabilize..." -sleep 5 - -# Gracefully shutdown old workers -if [[ -n "$OLD_PIDS" ]]; then - graceful_shutdown_old_workers "$OLD_PIDS" -fi - -# Finalize the rollover -finalize_rollover - -# Summary -echo "──────────────────────────────────────────────────" -echo "✓ Rolling update completed successfully" -echo " Old port: ${OLD_PORT:-none}" -echo " New port: $NEW_PORT" -echo " New PID: $NEW_PID" -echo " Logs: $BASE_LOG_DIR/$LATEST_LINK/" -echo "──────────────────────────────────────────────────" \ No newline at end of file diff --git a/scripts/start_services.sh b/scripts/start_services.sh index c1c5b86..f4c1470 100755 --- a/scripts/start_services.sh +++ b/scripts/start_services.sh @@ -1,42 +1,6 @@ #!/usr/bin/env bash set -e # Exit on error -############################################################################### -### ARGUMENT PARSING -############################################################################### - -DEV_MODE=false - -show_help() { - echo "Usage: $0 [OPTIONS]" - echo "" - echo "Options:" - echo " --dev Enable development mode with auto-reload for API changes" - echo " --help Show this help message" - echo "" - echo "Examples:" - echo " $0 # Start in production mode" - echo " $0 --dev # Start in development mode with auto-reload" -} - -while [[ $# -gt 0 ]]; do - case $1 in - --dev) - DEV_MODE=true - shift - ;; - --help|-h) - show_help - exit 0 - ;; - *) - echo "Unknown option: $1" - show_help - exit 1 - ;; - esac -done - ############################################################################### ### CONFIGURATION ############################################################################### @@ -55,16 +19,10 @@ VENV_PATH="$BASE_DIR/venv" ARQ_WORKERS=${ARQ_WORKERS:-1} LOG_TO_FILE=${LOG_TO_FILE:-true} # 
Set to false in Docker to use stdout -WAIT_FOR_PROCESSES=${WAIT_FOR_PROCESSES:-false} # Set to true in Docker to keep container alive # Log startup cd "$BASE_DIR" -if $DEV_MODE; then - echo "Starting Dograh Services (DEV MODE) at $(date) in BASE_DIR: ${BASE_DIR}" - echo "Auto-reload enabled for api/ directory changes" -else - echo "Starting Dograh Services at $(date) in BASE_DIR: ${BASE_DIR}" -fi +echo "Starting Dograh Services at $(date) in BASE_DIR: ${BASE_DIR}" ############################################################################### ### 1) Load environment variables @@ -75,10 +33,34 @@ if [[ -f "$ENV_FILE" ]]; then set -a && . "$ENV_FILE" && set +a fi -FASTAPI_PORT=${FASTAPI_PORT:-8000} +UVICORN_BASE_PORT=${UVICORN_BASE_PORT:-8000} CPU_CORES=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 1) FASTAPI_WORKERS=${FASTAPI_WORKERS:-$CPU_CORES} +############################################################################### +### 1b) Safety check — refuse to start over running services +############################################################################### + +if [[ -d "$RUN_DIR" ]]; then + live_count=0 + for pidfile in "$RUN_DIR"/*.pid; do + [[ -e "$pidfile" ]] || continue + pid=$(<"$pidfile") + if kill -0 "$pid" 2>/dev/null; then + live_count=$((live_count + 1)) + fi + done + + if [[ $live_count -gt 0 ]]; then + echo "ERROR: $live_count service(s) are still running." 
+ echo "" + echo " Stop first: ./scripts/stop_services.sh" + echo " For a zero-downtime deploy, use: ./scripts/rolling_update.sh" + echo "" + exit 1 + fi +fi + ############################################################################### ### 2) Define services ############################################################################### @@ -88,24 +70,20 @@ FASTAPI_WORKERS=${FASTAPI_WORKERS:-$CPU_CORES} SERVICE_NAMES=( "ari_manager" "campaign_orchestrator" - "uvicorn" ) -# Build uvicorn command based on mode -if $DEV_MODE; then - # Dev mode: single worker with auto-reload (--reload is incompatible with --workers > 1) - UVICORN_CMD="uvicorn api.app:app --host 0.0.0.0 --port $FASTAPI_PORT --reload --reload-dir api" -else - # Production mode: multiple workers, no reload - UVICORN_CMD="uvicorn api.app:app --host 0.0.0.0 --port $FASTAPI_PORT --workers $FASTAPI_WORKERS" -fi - SERVICE_COMMANDS=( "python -m api.services.telephony.ari_manager" "python -m api.services.campaign.campaign_orchestrator" - "$UVICORN_CMD" ) +# Add uvicorn workers on separate ports (behind nginx least_conn) +for ((w=0; w/dev/null || true) - - for child in $children; do - # Recursively get descendants of each child - descendants="$descendants $child $(get_descendants "$child")" - done - - echo "$descendants" -} - -# Function to kill a process and all its descendants -kill_process_tree() { - local pid=$1 - local signal=$2 - local descendants - - descendants=$(get_descendants "$pid") - - # Kill children first (bottom-up), then parent - for desc_pid in $descendants; do - if kill -0 "$desc_pid" 2>/dev/null; then - kill "$signal" "$desc_pid" 2>/dev/null || true - fi - done - - # Kill the parent - if kill -0 "$pid" 2>/dev/null; then - kill "$signal" "$pid" 2>/dev/null || true - fi -} - -for name in "${SERVICE_NAMES[@]}"; do - pidfile="$RUN_DIR/$name.pid" - - if [[ -f $pidfile ]]; then - oldpid=$(<"$pidfile") - - if kill -0 "$oldpid" 2>/dev/null; then - echo "Stopping $name (PID $oldpid and all 
descendants)…" - - # Kill the entire process tree (parent + all descendants) - kill_process_tree "$oldpid" "-TERM" - sleep 4 - - # Check if parent or any descendants are still alive - still_alive=false - if kill -0 "$oldpid" 2>/dev/null; then - still_alive=true - else - for desc_pid in $(get_descendants "$oldpid"); do - if kill -0 "$desc_pid" 2>/dev/null; then - still_alive=true - break - fi - done - fi - - if $still_alive; then - echo "⚠️ $name did not exit cleanly, forcing stop..." - kill_process_tree "$oldpid" "-KILL" - sleep 1 - fi - fi - - rm -f "$pidfile" - else - echo "No PID file for $name, skipping stop." - fi -done - -# Clean up any port tracking files for uvicorn -rm -f "$RUN_DIR/uvicorn.port" "$RUN_DIR/uvicorn_new.port" "$RUN_DIR/uvicorn_old.pid" +NGINX_UPSTREAM_TEMPLATE="$BASE_DIR/nginx/dograh_upstream.conf.template" +NGINX_UPSTREAM_CONF="/etc/nginx/conf.d/dograh_upstream.conf" ############################################################################### -### 5) Run migrations +### 4) Run migrations ############################################################################### alembic -c "$BASE_DIR/api/alembic.ini" upgrade head ############################################################################### -### 6) Prepare logs +### 5) Prepare logs ############################################################################### mkdir -p "$BASE_LOG_DIR" "$LOG_DIR" @@ -232,7 +129,7 @@ echo "Log directory: $LOG_DIR" echo "Latest symlink: $LATEST_LINK -> $TIMESTAMP" ############################################################################### -### 7) Start services +### 6) Start services ############################################################################### for i in "${!SERVICE_NAMES[@]}"; do @@ -255,22 +152,47 @@ for i in "${!SERVICE_NAMES[@]}"; do echo $pid >"$RUN_DIR/$name.pid" echo " Started with PID $pid" - if [[ "$name" == "uvicorn" ]]; then - echo "$FASTAPI_PORT" >"$RUN_DIR/uvicorn.port" - fi done +# Cold start always uses band A (for 
rolling_update.sh dual-band strategy) +echo "A" > "$RUN_DIR/active_band" + ############################################################################### -### 8) Summary +### 7) Generate nginx upstream config & reload +############################################################################### + +if [[ -f "$NGINX_UPSTREAM_TEMPLATE" ]]; then + # Build upstream server list from worker ports + UPSTREAM_SERVERS="" + for ((w=0; w /dev/null + + echo "Generated nginx upstream config with $FASTAPI_WORKERS workers (ports ${UVICORN_BASE_PORT}-$((UVICORN_BASE_PORT + FASTAPI_WORKERS - 1)))" + + # Test and reload nginx + if sudo nginx -t 2>/dev/null; then + sudo systemctl reload nginx + echo "Nginx reloaded successfully" + else + echo "ERROR: nginx config test failed, not reloading" + sudo nginx -t + exit 1 + fi +fi + +############################################################################### +### 8) Summary ############################################################################### echo echo "──────────────────────────────────────────────────" -if $DEV_MODE; then - echo "Mode: DEVELOPMENT (auto-reload enabled)" -else - echo "Mode: PRODUCTION" -fi +echo "Mode: PRODUCTION" echo "" for name in "${SERVICE_NAMES[@]}"; do pid=$(<"$RUN_DIR/$name.pid") @@ -284,8 +206,3 @@ echo "Logs: tail -f $LOG_DIR/*.log" echo "Rotated logs: ls $LOG_DIR/*.log.*" echo "To stop: ./scripts/stop_services.sh" echo "──────────────────────────────────────────────────" - -# In Docker mode, wait for all background processes to keep container alive -if [[ "$WAIT_FOR_PROCESSES" == "true" ]]; then - wait -fi diff --git a/scripts/start_services_dev.sh b/scripts/start_services_dev.sh new file mode 100755 index 0000000..86e54d7 --- /dev/null +++ b/scripts/start_services_dev.sh @@ -0,0 +1,219 @@ +#!/usr/bin/env bash +set -e # Exit on error + +############################################################################### +### CONFIGURATION 
+############################################################################### + +# Determine BASE_DIR as parent of the scripts directory +BASE_DIR="$(cd "$(dirname "$(dirname "${BASH_SOURCE[0]}")")" && pwd)" + +ENV_FILE="$BASE_DIR/api/.env" +RUN_DIR="$BASE_DIR/run" # Where we keep *.pid +BASE_LOG_DIR="$BASE_DIR/logs" # Base logs directory + +TIMESTAMP=$(date +"%Y%m%d_%H%M%S") +LOG_DIR="$BASE_LOG_DIR/$TIMESTAMP" # Timestamped log directory +LATEST_LINK="$BASE_LOG_DIR/latest" # Symlink to latest logs +VENV_PATH="$BASE_DIR/venv" + +ARQ_WORKERS=${ARQ_WORKERS:-1} +LOG_TO_FILE=${LOG_TO_FILE:-true} + +cd "$BASE_DIR" +echo "Starting Dograh Services (DEV MODE) at $(date) in BASE_DIR: ${BASE_DIR}" +echo "Auto-reload enabled for api/ directory changes" + +############################################################################### +### 1) Load environment variables +############################################################################### + +if [[ -f "$ENV_FILE" ]]; then + set -a && . "$ENV_FILE" && set +a +fi + +UVICORN_BASE_PORT=${UVICORN_BASE_PORT:-8000} + +############################################################################### +### 2) Define services +############################################################################### + +SERVICE_NAMES=( + "ari_manager" + "campaign_orchestrator" + "uvicorn" +) + +SERVICE_COMMANDS=( + "python -m api.services.telephony.ari_manager" + "python -m api.services.campaign.campaign_orchestrator" + "uvicorn api.app:app --host 0.0.0.0 --port $UVICORN_BASE_PORT --reload --reload-dir api" +) + +# Add ARQ workers dynamically +for ((i=1; i<=ARQ_WORKERS; i++)); do + SERVICE_NAMES+=("arq$i") + SERVICE_COMMANDS+=("python -m arq api.tasks.arq.WorkerSettings --custom-log-dict api.tasks.arq.LOG_CONFIG") +done + +############################################################################### +### 3) Activate virtual environment +############################################################################### + +if [[ -d 
"$VENV_PATH" && -f "$VENV_PATH/bin/activate" ]]; then + source "$VENV_PATH/bin/activate" + echo "Virtual environment activated: $VENV_PATH" +else + echo "Warning: Virtual environment not found at $VENV_PATH" + echo "Continuing without virtual environment activation..." +fi + +############################################################################### +### 4) Stop old services +############################################################################### + +mkdir -p "$RUN_DIR" + +# Function to get all descendant PIDs of a process (children, grandchildren, etc.) +get_descendants() { + local parent_pid=$1 + local descendants="" + local children + + # Get direct children + children=$(pgrep -P "$parent_pid" 2>/dev/null || true) + + for child in $children; do + # Recursively get descendants of each child + descendants="$descendants $child $(get_descendants "$child")" + done + + echo "$descendants" +} + +# Kill a process tree: $1 = root PID, $2 = signal option passed to kill (e.g. -TERM); failures to signal are ignored +kill_process_tree() { + local pid=$1 + local signal=$2 + local descendants + + descendants=$(get_descendants "$pid") + + # Kill the parent first so supervisors (e.g. the uvicorn --reload parent) don't respawn children; the descendant list was captured above, before any signal is sent + if kill -0 "$pid" 2>/dev/null; then + kill "$signal" "$pid" 2>/dev/null || true + fi + + # Then kill any remaining descendants (same order as kill_process_tree in stop_services.sh) + for desc_pid in $descendants; do + if kill -0 "$desc_pid" 2>/dev/null; then + kill "$signal" "$desc_pid" 2>/dev/null || true + fi + done +} + +for name in "${SERVICE_NAMES[@]}"; do + pidfile="$RUN_DIR/$name.pid" + + if [[ -f $pidfile ]]; then + oldpid=$(<"$pidfile") + + if kill -0 "$oldpid" 2>/dev/null; then + echo "Stopping $name (PID $oldpid and all descendants)…" + + kill_process_tree "$oldpid" "-TERM" + sleep 4 + + still_alive=false + if kill -0 "$oldpid" 2>/dev/null; then + still_alive=true + else + for desc_pid in $(get_descendants "$oldpid"); do + if kill -0 "$desc_pid" 2>/dev/null; then + still_alive=true + break + fi + done + fi + + if $still_alive; then + echo "⚠️ $name did not exit cleanly, forcing stop..."
+ kill_process_tree "$oldpid" "-KILL" + sleep 1 + fi + fi + + rm -f "$pidfile" + else + echo "No PID file for $name, skipping stop." + fi +done + +# Clean up legacy port tracking files +rm -f "$RUN_DIR/uvicorn.port" "$RUN_DIR/uvicorn_new.port" "$RUN_DIR/uvicorn_old.pid" "$RUN_DIR/active_band" + +############################################################################### +### 5) Run migrations +############################################################################### + +alembic -c "$BASE_DIR/api/alembic.ini" upgrade head + +############################################################################### +### 6) Prepare logs +############################################################################### + +mkdir -p "$BASE_LOG_DIR" "$LOG_DIR" + +if [[ -L "$LATEST_LINK" ]]; then + rm "$LATEST_LINK" +fi +ln -s "$TIMESTAMP" "$LATEST_LINK" + +echo "Log directory: $LOG_DIR" +echo "Latest symlink: $LATEST_LINK -> $TIMESTAMP" + +############################################################################### +### 7) Start services +############################################################################### + +for i in "${!SERVICE_NAMES[@]}"; do + name="${SERVICE_NAMES[$i]}" + cmd="${SERVICE_COMMANDS[$i]}" + echo "→ Starting $name" + + ( + cd "$BASE_DIR" + if [[ "$LOG_TO_FILE" == "true" ]]; then + export LOG_FILE_PATH="$LOG_DIR/$name.log" + exec $cmd >>"$LOG_DIR/$name.log" 2>&1 + else + exec $cmd + fi + ) & + + pid=$! 
+ echo $pid >"$RUN_DIR/$name.pid" + echo " Started with PID $pid" + +done + +############################################################################### +### 8) Summary +############################################################################### + +echo +echo "──────────────────────────────────────────────────" +echo "Mode: DEVELOPMENT (auto-reload enabled)" +echo "" +for name in "${SERVICE_NAMES[@]}"; do + pid=$(<"$RUN_DIR/$name.pid") + echo "✓ $name (PID $pid) → $LOG_DIR/$name.log" +done +echo "" +echo " Rotation: ${LOG_ROTATION_SIZE:-100 MB}" +echo " Retention: ${LOG_RETENTION:-7 days}" +echo " Compression: ${LOG_COMPRESSION:-gz}" +echo "Logs: tail -f $LOG_DIR/*.log" +echo "Rotated logs: ls $LOG_DIR/*.log.*" +echo "To stop: ./scripts/stop_services.sh" +echo "──────────────────────────────────────────────────" diff --git a/scripts/stop_services.sh b/scripts/stop_services.sh index 887e1aa..504ecbd 100755 --- a/scripts/stop_services.sh +++ b/scripts/stop_services.sh @@ -42,17 +42,17 @@ kill_process_tree() { descendants=$(get_descendants "$pid") - # Kill children first (bottom-up), then parent + # Kill the parent first so supervisors don't respawn children + if kill -0 "$pid" 2>/dev/null; then + kill "$signal" "$pid" 2>/dev/null || true + fi + + # Then kill any remaining descendants for desc_pid in $descendants; do if kill -0 "$desc_pid" 2>/dev/null; then kill "$signal" "$desc_pid" 2>/dev/null || true fi done - - # Kill the parent - if kill -0 "$pid" 2>/dev/null; then - kill "$signal" "$pid" 2>/dev/null || true - fi } ############################################################################### @@ -113,14 +113,14 @@ for pidfile in "${pid_files[@]}"; do # Final check if kill -0 "$oldpid" 2>/dev/null; then echo " Error: Failed to stop $name (PID $oldpid)" - ((failed_count++)) + failed_count=$((failed_count + 1)) else echo " Stopped $name (forced)" - ((stopped_count++)) + stopped_count=$((stopped_count + 1)) fi else echo " Stopped $name" - ((stopped_count++)) 
+ stopped_count=$((stopped_count + 1)) fi else echo "Service $name (PID $oldpid) is not running" @@ -130,8 +130,8 @@ for pidfile in "${pid_files[@]}"; do fi done -# Clean up any port tracking files for uvicorn -rm -f "$RUN_DIR/uvicorn.port" "$RUN_DIR/uvicorn_new.port" "$RUN_DIR/uvicorn_old.pid" +# Clean up any port tracking files for uvicorn and band tracking +rm -f "$RUN_DIR/uvicorn.port" "$RUN_DIR/uvicorn_new.port" "$RUN_DIR/uvicorn_old.pid" "$RUN_DIR/active_band" ############################################################################### ### SUMMARY