#!/bin/bash
#
# Startup script: launches the process(es) selected by SERVICE_ROLE and
# stays in the foreground until one of them exits or a shutdown signal
# arrives.

# Abort on any unhandled command failure.
set -e

# ─────────────────────────────────────────────────────────────
# SERVICE_ROLE controls which process(es) this container runs.
#
#   migrate – Run `alembic upgrade head`, verify zero_publication,
#             then exit 0. Used by the dedicated `migrations` service
#             in docker-compose.yml so downstream services can gate
#             on `condition: service_completed_successfully`.
#   api     – FastAPI backend only (does NOT run migrations)
#   worker  – Celery worker only
#   beat    – Celery beat scheduler only
#   all     – migrations + api + worker + beat in one container
#             (legacy / dev default; fails fast on migration error)
#
# Set SERVICE_ROLE as an environment variable in Coolify for
# each service deployment.
# ─────────────────────────────────────────────────────────────
SERVICE_ROLE="${SERVICE_ROLE:-all}"
echo "Starting SurfSense with SERVICE_ROLE=${SERVICE_ROLE}"

# ── Autoscale defaults (override via env) ────────────────────
#   CELERY_MAX_WORKERS         – max concurrent worker processes
#   CELERY_MIN_WORKERS         – min workers kept warm
#   CELERY_MAX_TASKS_PER_CHILD – value passed to --max-tasks-per-child
#   CELERY_QUEUES              – comma-separated queues to consume
#                                (empty = the default queue plus its
#                                `.connectors` queue; see start_worker)
CELERY_MAX_WORKERS="${CELERY_MAX_WORKERS:-10}"
CELERY_MIN_WORKERS="${CELERY_MIN_WORKERS:-2}"
CELERY_MAX_TASKS_PER_CHILD="${CELERY_MAX_TASKS_PER_CHILD:-50}"
CELERY_QUEUES="${CELERY_QUEUES:-}"

# ── Graceful shutdown ────────────────────────────────────────
|
|
|
|
|
|
PIDS=()
|
|
|
|
|
|
|
2025-10-23 15:49:16 -07:00
|
|
|
|
cleanup() {
|
|
|
|
|
|
echo "Shutting down services..."
|
2026-02-10 20:34:04 -08:00
|
|
|
|
for pid in "${PIDS[@]}"; do
|
|
|
|
|
|
kill -TERM "$pid" 2>/dev/null || true
|
|
|
|
|
|
done
|
|
|
|
|
|
for pid in "${PIDS[@]}"; do
|
|
|
|
|
|
wait "$pid" 2>/dev/null || true
|
|
|
|
|
|
done
|
2025-10-23 15:49:16 -07:00
|
|
|
|
exit 0
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
trap cleanup SIGTERM SIGINT
|
|
|
|
|
|
|
2026-05-20 01:25:07 -07:00
|
|
|
|
# ── Database migrations (only for migrate / all) ─────────────
# Fail-fast contract:
#   - alembic upgrade head must succeed within ${MIGRATION_TIMEOUT:-900}s
#   - zero_publication must exist in pg_publication afterwards
# Either failure exits non-zero so the dedicated `migrations` compose
# service exits non-zero, halting the rest of the stack instead of
# silently producing a half-built system that crash-loops zero-cache.
#
# Globals:   MIGRATION_TIMEOUT (read, optional; seconds, default 900)
# Outputs:   progress on stdout, errors and warnings on stderr
# Files:     /zero-init/last_pub_oid and /zero-init/needs_reset, only
#            when the /zero-init directory exists
# Exits:     1 (terminating the whole script) when alembic fails or
#            times out, or when zero_publication cannot be found
run_migrations() {
  echo "Running database migrations..."

  # Probe the database up to 30 times, one second apart. The probe's
  # stderr is discarded so a not-yet-ready database does not spam logs.
  local attempt
  local db_ready=0
  for attempt in {1..30}; do
    if python -c "from app.db import engine; import asyncio; asyncio.run(engine.dispose())" 2>/dev/null; then
      echo "Database is ready."
      db_ready=1
      break
    fi
    echo "Waiting for database... (${attempt}/30)"
    sleep 1
  done
  if [ "${db_ready}" -ne 1 ]; then
    # Keep going: alembic below is the authoritative check and will
    # fail the run if the database really is unreachable.
    echo "WARNING: database probe did not succeed after 30 attempts; attempting migrations anyway." >&2
  fi

  local timeout_secs="${MIGRATION_TIMEOUT:-900}"
  echo "Running alembic upgrade head (timeout=${timeout_secs}s)..."
  if ! timeout "${timeout_secs}" alembic upgrade head; then
    echo "ERROR: alembic upgrade head failed (or exceeded ${timeout_secs}s timeout)." >&2
    echo "Refusing to start. Inspect the error above and re-run." >&2
    exit 1
  fi
  echo "Migrations completed successfully."

  echo "Verifying zero_publication exists in Postgres..."
  # Declared separately from the assignment so `local` does not mask the
  # substitution. The helper prints the publication oid on success and
  # prints nothing when the row is absent; any failure of the helper is
  # swallowed (`|| true`) and therefore also surfaces as an empty value.
  local pub_oid
  pub_oid=$(python <<'PY' 2>/dev/null || true
import asyncio
import sys
from sqlalchemy import text
from app.db import engine


async def get_oid():
    async with engine.connect() as conn:
        result = await conn.execute(
            text("SELECT oid FROM pg_publication WHERE pubname = 'zero_publication'")
        )
        row = result.first()
        if row is None:
            sys.exit(1)
        print(int(row[0]))


asyncio.run(get_oid())
PY
  )
  if [ -z "${pub_oid}" ]; then
    echo "ERROR: zero_publication is missing from Postgres after running alembic." >&2
    echo "This usually means migration 116 (or a later publication migration) did not run." >&2
    echo "Inspect alembic state with:" >&2
    echo " docker compose exec db psql -U \"\$DB_USER\" -d \"\$DB_NAME\" -c 'SELECT * FROM alembic_version;'" >&2
    exit 1
  fi
  echo "zero_publication verified (oid=${pub_oid})."

  # Stale-replica safety net: if /zero-init is mounted (i.e. we are the
  # dedicated `migrations` compose service), drop a marker file when the
  # publication oid changed (or on first run) so the wrapped zero-cache
  # entrypoint can wipe /data/zero.db before starting. This recovers from
  # the case where a previous zero-cache crashed mid-init and left a
  # half-built SQLite replica without a `_zero.tableMetadata` table.
  if [ -d /zero-init ]; then
    local stored_oid=""
    if [ -f /zero-init/last_pub_oid ]; then
      stored_oid=$(cat /zero-init/last_pub_oid 2>/dev/null || true)
    fi

    # pub_oid is non-empty here, so an empty stored_oid (first run or
    # unreadable file) also compares as "changed".
    if [ "${stored_oid}" != "${pub_oid}" ]; then
      echo "Publication oid changed (stored=${stored_oid:-<none>}, current=${pub_oid}); writing /zero-init/needs_reset."
      : > /zero-init/needs_reset
      chmod 666 /zero-init/needs_reset 2>/dev/null || true
    fi

    echo "${pub_oid}" > /zero-init/last_pub_oid
    chmod 666 /zero-init/last_pub_oid 2>/dev/null || true
    # World-writable dir so the (possibly non-root) zero-cache container
    # can `rm -f /zero-init/needs_reset` after acting on the marker.
    chmod 777 /zero-init 2>/dev/null || true
  fi
}

# ── Service starters ─────────────────────────────────────────

#######################################
# Launch the FastAPI backend (`python main.py`) in the background.
# Globals:   PIDS (appended with the new process id)
# Outputs:   two status lines on stdout
#######################################
start_api() {
  echo "Starting FastAPI Backend..."
  python main.py &
  # $! is the PID of the job just backgrounded; record it so cleanup()
  # can signal it later.
  PIDS+=("$!")
  echo " FastAPI PID=${PIDS[-1]}"
}

#######################################
# Launch a Celery worker in the background.
# Globals:   CELERY_QUEUES, CELERY_TASK_DEFAULT_QUEUE (optional),
#            CELERY_MAX_WORKERS, CELERY_MIN_WORKERS,
#            CELERY_MAX_TASKS_PER_CHILD (read)
#            PIDS (appended with the new process id)
# Outputs:   two status lines on stdout
#######################################
start_worker() {
  local queues="${CELERY_QUEUES}"
  if [ -z "${queues}" ]; then
    # When no queues specified, consume from BOTH the default queue and
    # the connectors queue. Without --queues, Celery only consumes from
    # the default queue, leaving connector indexing tasks stuck.
    local default_q="${CELERY_TASK_DEFAULT_QUEUE:-surfsense}"
    queues="${default_q},${default_q}.connectors"
  fi

  # Log the queue list that is actually passed to Celery, not the raw
  # (possibly empty) CELERY_QUEUES value.
  echo "Starting Celery Worker (autoscale=${CELERY_MAX_WORKERS},${CELERY_MIN_WORKERS}, max-tasks-per-child=${CELERY_MAX_TASKS_PER_CHILD}, queues=${queues})..."

  # Build the argument list as an array so every option reaches celery
  # as exactly one word regardless of its content.
  local -a worker_args=(
    -A app.celery_app worker
    --loglevel=info
    "--autoscale=${CELERY_MAX_WORKERS},${CELERY_MIN_WORKERS}"
    "--max-tasks-per-child=${CELERY_MAX_TASKS_PER_CHILD}"
    --prefetch-multiplier=1
    -Ofair
    "--queues=${queues}"
  )
  celery "${worker_args[@]}" &
  PIDS+=("$!")
  echo " Celery Worker PID=${PIDS[-1]}"
}

#######################################
# Launch the Celery beat scheduler in the background.
# Globals:   PIDS (appended with the new process id)
# Outputs:   two status lines on stdout
#######################################
start_beat() {
  echo "Starting Celery Beat..."
  celery -A app.celery_app beat --loglevel=info &
  # Track the scheduler so cleanup() can stop it on shutdown.
  PIDS+=("$!")
  echo " Celery Beat PID=${PIDS[-1]}"
}

# ── Main: run based on role ──────────────────────────────────
case "${SERVICE_ROLE}" in
  migrate)
    # One-shot role: run_migrations exits non-zero itself on failure, so
    # reaching the lines below means everything succeeded.
    run_migrations
    echo "Migrations complete; exiting cleanly."
    exit 0
    ;;
  api)
    start_api
    ;;
  worker)
    start_worker
    ;;
  beat)
    start_beat
    ;;
  all)
    run_migrations
    start_api
    # Fixed pauses between launches stagger the start of each service.
    sleep 5
    start_worker
    sleep 3
    start_beat
    ;;
  *)
    # Diagnostics go to stderr, like the other error paths in this script.
    echo "ERROR: Unknown SERVICE_ROLE '${SERVICE_ROLE}'. Use: migrate, api, worker, beat, or all" >&2
    exit 1
    ;;
esac

echo "All requested services started. PIDs: ${PIDS[*]}"

# Wait for any process to exit. The status is captured explicitly so the
# result does not depend on how `set -e` treats a failing `wait`.
exit_status=0
wait -n || exit_status=$?

# If we get here, one process exited unexpectedly; exit with its status.
exit "${exit_status}"