SurfSense/surfsense_backend/scripts/docker/entrypoint.sh
2026-05-20 01:25:07 -07:00

205 lines
7.7 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
set -e

# ─────────────────────────────────────────────────────────────
# SERVICE_ROLE selects which process(es) this container runs.
#
#   migrate  Run `alembic upgrade head`, verify zero_publication,
#            then exit 0. Used by the dedicated `migrations`
#            service in docker-compose.yml so downstream services
#            can gate on `condition: service_completed_successfully`.
#   api      FastAPI backend only (does NOT run migrations)
#   worker   Celery worker only
#   beat     Celery beat scheduler only
#   all      migrations + api + worker + beat in one container
#            (legacy / dev default; fails fast on migration error)
#
# In Coolify, set SERVICE_ROLE as an environment variable on each
# service deployment.
# ─────────────────────────────────────────────────────────────

# Fall back to the single-container role when nothing was configured.
: "${SERVICE_ROLE:=all}"
printf 'Starting SurfSense with SERVICE_ROLE=%s\n' "${SERVICE_ROLE}"
# ── Autoscale defaults (override via env) ────────────────────
#   CELERY_MAX_WORKERS          first value handed to the worker's --autoscale
#   CELERY_MIN_WORKERS          second value handed to the worker's --autoscale
#   CELERY_MAX_TASKS_PER_CHILD  value handed to --max-tasks-per-child
#   CELERY_QUEUES               comma-separated queues to consume; when left
#                               empty, start_worker falls back to
#                               "<default queue>,<default queue>.connectors"
# Each variable keeps a non-empty value supplied by the environment and
# otherwise receives the default below.
: "${CELERY_MAX_WORKERS:=10}"
: "${CELERY_MIN_WORKERS:=2}"
: "${CELERY_MAX_TASKS_PER_CHILD:=50}"
: "${CELERY_QUEUES:=}"
# ── Graceful shutdown ────────────────────────────────────────
# PIDs of every background service launched by this script.
PIDS=()

# Signal handler: ask every recorded child to stop, wait for each of
# them, then leave with status 0. Failures of kill/wait (for example a
# child that is already gone) are deliberately ignored.
cleanup() {
  local child

  echo "Shutting down services..."

  # Phase 1: deliver SIGTERM to all children before waiting on any of
  # them, so they shut down in parallel rather than one after another.
  for child in "${PIDS[@]}"; do
    kill -TERM "${child}" 2>/dev/null || :
  done

  # Phase 2: reap each child.
  for child in "${PIDS[@]}"; do
    wait "${child}" 2>/dev/null || :
  done

  exit 0
}

trap cleanup SIGTERM SIGINT
# ── Database migrations (only for migrate / all) ─────────────
# Fail-fast contract:
#   - alembic upgrade head must succeed within ${MIGRATION_TIMEOUT:-900}s
#   - zero_publication must exist in pg_publication afterwards
# Either failure exits non-zero so the dedicated `migrations` compose
# service exits non-zero, halting the rest of the stack instead of
# silently producing a half-built system that crash-loops zero-cache.
#
# Environment:
#   MIGRATION_TIMEOUT  seconds allowed for `alembic upgrade head`
#                      (default 900)
#   ZERO_INIT_DIR      directory holding the zero-cache reset markers
#                      (default /zero-init; the marker step is skipped
#                      when the directory does not exist)
# Outputs:  progress on stdout, warnings/errors on stderr
# Returns:  normally on success; calls `exit 1` on any failure
run_migrations() {
  local attempt
  local db_ready=0

  echo "Running database migrations..."

  # Poll up to 30 times, one second apart.
  # NOTE(review): the probe imports app.db and disposes the engine;
  # whether that really opens a connection depends on app.db — confirm.
  for attempt in {1..30}; do
    if python -c "from app.db import engine; import asyncio; asyncio.run(engine.dispose())" 2>/dev/null; then
      echo "Database is ready."
      db_ready=1
      break
    fi
    echo "Waiting for database... (${attempt}/30)"
    sleep 1
  done

  # Do not abort here: alembic below is the authoritative check. Just
  # make it visible that the probe never passed.
  if [ "${db_ready}" -eq 0 ]; then
    echo "WARNING: database readiness probe did not succeed after 30 attempts; attempting migrations anyway." >&2
  fi

  local timeout_secs="${MIGRATION_TIMEOUT:-900}"
  echo "Running alembic upgrade head (timeout=${timeout_secs}s)..."
  if ! timeout "${timeout_secs}" alembic upgrade head; then
    echo "ERROR: alembic upgrade head failed (or exceeded ${timeout_secs}s timeout)." >&2
    echo "Refusing to start. Inspect the error above and re-run." >&2
    exit 1
  fi
  echo "Migrations completed successfully."

  echo "Verifying zero_publication exists in Postgres..."
  # The helper prints the publication oid, or prints nothing when the
  # row is missing or the query fails. Its stderr and exit status are
  # intentionally discarded; an empty result is what signals failure.
  local pub_oid
  pub_oid=$(python <<'PY' 2>/dev/null || true
import asyncio
import sys

from sqlalchemy import text

from app.db import engine


async def get_oid():
    async with engine.connect() as conn:
        result = await conn.execute(
            text("SELECT oid FROM pg_publication WHERE pubname = 'zero_publication'")
        )
        row = result.first()
        if row is None:
            sys.exit(1)
        print(int(row[0]))


asyncio.run(get_oid())
PY
)
  if [ -z "${pub_oid}" ]; then
    echo "ERROR: zero_publication is missing from Postgres after running alembic." >&2
    echo "This usually means migration 116 (or a later publication migration) did not run." >&2
    echo "Inspect alembic state with:" >&2
    echo " docker compose exec db psql -U \"\$DB_USER\" -d \"\$DB_NAME\" -c 'SELECT * FROM alembic_version;'" >&2
    exit 1
  fi
  echo "zero_publication verified (oid=${pub_oid})."

  # Stale-replica safety net: if the marker directory is mounted (i.e.
  # we are the dedicated `migrations` compose service), drop a marker
  # file when the publication oid changed (or on first run) so the
  # wrapped zero-cache entrypoint can wipe /data/zero.db before
  # starting. This recovers from the case where a previous zero-cache
  # crashed mid-init and left a half-built SQLite replica without a
  # `_zero.tableMetadata` table.
  local zero_init_dir="${ZERO_INIT_DIR:-/zero-init}"
  if [ -d "${zero_init_dir}" ]; then
    local stored_oid=""
    if [ -f "${zero_init_dir}/last_pub_oid" ]; then
      stored_oid=$(cat "${zero_init_dir}/last_pub_oid" 2>/dev/null || true)
    fi
    if [ -z "${stored_oid}" ] || [ "${stored_oid}" != "${pub_oid}" ]; then
      echo "Publication oid changed (stored=${stored_oid:-<none>}, current=${pub_oid}); writing ${zero_init_dir}/needs_reset."
      : > "${zero_init_dir}/needs_reset"
      chmod 666 "${zero_init_dir}/needs_reset" 2>/dev/null || true
    fi
    echo "${pub_oid}" > "${zero_init_dir}/last_pub_oid"
    chmod 666 "${zero_init_dir}/last_pub_oid" 2>/dev/null || true
    # World-writable dir so the (possibly non-root) zero-cache container
    # can remove the needs_reset marker after acting on it.
    chmod 777 "${zero_init_dir}" 2>/dev/null || true
  fi
}
# ── Service starters ─────────────────────────────────────────
# Launch the FastAPI backend (`python main.py`) in the background and
# record its PID in the global PIDS array.
start_api() {
  local api_pid

  echo "Starting FastAPI Backend..."
  python main.py &
  api_pid=$!
  PIDS+=("${api_pid}")
  echo " FastAPI PID=${api_pid}"
}
# Launch a Celery worker in the background and record its PID in PIDS.
# Globals read:
#   CELERY_QUEUES               comma-separated queue list; may be empty
#   CELERY_TASK_DEFAULT_QUEUE   default queue name (fallback: surfsense)
#   CELERY_MAX_WORKERS, CELERY_MIN_WORKERS   passed to --autoscale
#   CELERY_MAX_TASKS_PER_CHILD  passed to --max-tasks-per-child
# Globals written: PIDS (worker PID appended)
start_worker() {
  local default_queue queue_list
  local -a worker_args

  if [ -n "${CELERY_QUEUES:-}" ]; then
    queue_list="${CELERY_QUEUES}"
  else
    # When no queues specified, consume from BOTH the default queue and
    # the connectors queue. Without --queues, Celery only consumes from
    # the default queue, leaving connector indexing tasks stuck.
    default_queue="${CELERY_TASK_DEFAULT_QUEUE:-surfsense}"
    queue_list="${default_queue},${default_queue}.connectors"
  fi

  # Build the command line as an array so each option stays exactly one
  # argument even if a value contains whitespace or glob characters.
  worker_args=(
    -A app.celery_app worker
    --loglevel=info
    "--autoscale=${CELERY_MAX_WORKERS},${CELERY_MIN_WORKERS}"
    "--max-tasks-per-child=${CELERY_MAX_TASKS_PER_CHILD}"
    --prefetch-multiplier=1
    -Ofair
    "--queues=${queue_list}"
  )

  # Log the queue list that is actually passed to the worker.
  echo "Starting Celery Worker (autoscale=${CELERY_MAX_WORKERS},${CELERY_MIN_WORKERS}, max-tasks-per-child=${CELERY_MAX_TASKS_PER_CHILD}, queues=${queue_list})..."
  celery "${worker_args[@]}" &
  PIDS+=("$!")
  echo " Celery Worker PID=${PIDS[-1]}"
}
# Launch the Celery beat scheduler in the background and record its PID
# in the global PIDS array.
start_beat() {
  local beat_pid

  echo "Starting Celery Beat..."
  celery -A app.celery_app beat --loglevel=info &
  beat_pid=$!
  PIDS+=("${beat_pid}")
  echo " Celery Beat PID=${beat_pid}"
}
# ── Main: run based on role ──────────────────────────────────
# Dispatch on SERVICE_ROLE, then block until the first background
# service exits and leave with that service's exit status.
case "${SERVICE_ROLE}" in
  migrate)
    run_migrations
    echo "Migrations complete; exiting cleanly."
    exit 0
    ;;
  api)
    start_api
    ;;
  worker)
    start_worker
    ;;
  beat)
    start_beat
    ;;
  all)
    # Migrations run first; run_migrations exits non-zero on failure,
    # so nothing below starts against an unmigrated database.
    run_migrations
    start_api
    sleep 5
    start_worker
    sleep 3
    start_beat
    ;;
  *)
    # Errors go to stderr, matching the other error paths in this file.
    echo "ERROR: Unknown SERVICE_ROLE '${SERVICE_ROLE}'. Use: migrate, api, worker, beat, or all" >&2
    exit 1
    ;;
esac

echo "All requested services started. PIDs: ${PIDS[*]}"

# Wait for any process to exit. The status is captured explicitly so a
# non-zero status does not abort under `set -e` before it is logged.
exit_status=0
wait -n || exit_status=$?

# If we get here, one process exited unexpectedly.
echo "A service process exited (status=${exit_status}); exiting." >&2
exit "${exit_status}"