fix: docker one click setup

This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-05-20 01:25:07 -07:00
parent 8174949b38
commit b285293b4e
10 changed files with 681 additions and 27 deletions

7
.gitignore vendored
View file

@ -6,16 +6,15 @@ node_modules/
.venv
.pnpm-store
.DS_Store
deepagents/
debug.log
opencode/
references/
references
# Playwright (E2E test artifacts)
surfsense_web/playwright/.auth/
surfsense_web/playwright-report/
surfsense_web/test-results/
surfsense_web/blob-report/
hermes-agent
hermes-agent/
content_research/

View file

@ -20,6 +20,18 @@
# - Backend .env: SEARXNG_DEFAULT_HOST=http://localhost:${SEARXNG_PORT:-8888}
# - Backend .env: CELERY_BROKER_URL / REDIS_APP_URL → redis://localhost:6379/0
# - Web .env: NEXT_PUBLIC_ZERO_CACHE_URL=http://localhost:${ZERO_CACHE_PORT:-4848}
#
# IMPORTANT — schema migrations:
# This compose file does NOT build the backend image and therefore cannot
# run a `migrations` service. You MUST run alembic on the host before
# bringing zero-cache up, or zero-cache will crash-loop with
# `Unknown or invalid publications. Specified: [zero_publication]`.
#
# First-time / after-pull:
# cd surfsense_backend && uv run alembic upgrade head
#
# The other compose files (docker-compose.yml, docker-compose.dev.yml)
# handle this automatically via a dedicated `migrations` service.
# =============================================================================
name: surfsense-deps
@ -82,6 +94,10 @@ services:
timeout: 5s
retries: 5
# NOTE: zero-cache requires the `zero_publication` Postgres publication to
# exist before it starts. In this deps-only stack there is no backend
# container to run migrations, so you must run `uv run alembic upgrade head`
# from `surfsense_backend/` on the host BEFORE `docker compose up -d`.
zero-cache:
image: rocicorp/zero:1.4.0
ports:

View file

@ -34,6 +34,25 @@ services:
timeout: 5s
retries: 5
# Short-lived schema runner; see docker/docker-compose.yml `migrations`
# service for the full rationale. Builds from the same backend context as
# the dev backend/celery services.
migrations:
build: *backend-build
env_file:
- ../surfsense_backend/.env
environment:
- DATABASE_URL=${DATABASE_URL:-postgresql+asyncpg://${DB_USER:-postgres}:${DB_PASSWORD:-postgres}@${DB_HOST:-db}:${DB_PORT:-5432}/${DB_NAME:-surfsense}}
- PYTHONPATH=/app
- SERVICE_ROLE=migrate
- MIGRATION_TIMEOUT=${MIGRATION_TIMEOUT:-900}
volumes:
- zero_init:/zero-init
depends_on:
db:
condition: service_healthy
restart: "no"
pgadmin:
image: dpage/pgadmin4
ports:
@ -111,8 +130,10 @@ services:
condition: service_healthy
searxng:
condition: service_healthy
migrations:
condition: service_completed_successfully
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
test: ["CMD", "curl", "-f", "http://localhost:8000/ready"]
interval: 15s
timeout: 5s
retries: 30
@ -141,6 +162,8 @@ services:
condition: service_healthy
redis:
condition: service_healthy
migrations:
condition: service_completed_successfully
backend:
condition: service_healthy
@ -160,6 +183,8 @@ services:
condition: service_healthy
redis:
condition: service_healthy
migrations:
condition: service_completed_successfully
celery_worker:
condition: service_started
@ -185,8 +210,10 @@ services:
extra_hosts:
- "host.docker.internal:host-gateway"
depends_on:
backend:
db:
condition: service_healthy
migrations:
condition: service_completed_successfully
environment:
- ZERO_UPSTREAM_DB=${ZERO_UPSTREAM_DB:-postgresql://${DB_USER:-postgres}:${DB_PASSWORD:-postgres}@${DB_HOST:-db}:${DB_PORT:-5432}/${DB_NAME:-surfsense}?sslmode=${DB_SSLMODE:-disable}}
- ZERO_CVR_DB=${ZERO_CVR_DB:-postgresql://${DB_USER:-postgres}:${DB_PASSWORD:-postgres}@${DB_HOST:-db}:${DB_PORT:-5432}/${DB_NAME:-surfsense}?sslmode=${DB_SSLMODE:-disable}}
@ -201,6 +228,12 @@ services:
- ZERO_MUTATE_URL=${ZERO_MUTATE_URL:-http://frontend:3000/api/zero/mutate}
volumes:
- zero_cache_data:/data
- zero_init:/zero-init
# Wrapper: see docker/docker-compose.yml `zero-cache` for rationale.
entrypoint: ["sh", "-c"]
# Pass the script as a single list element so Compose does not tokenize it.
command:
- 'if [ -f /zero-init/needs_reset ]; then echo "[zero-init] publication change detected; wiping replica file(s) under /data" && rm -f /data/zero.db /data/zero.db-shm /data/zero.db-wal && rm -f /zero-init/needs_reset; fi; exec zero-cache'
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:4848/keepalive"]
@ -238,3 +271,5 @@ volumes:
name: surfsense-dev-shared-temp
zero_cache_data:
name: surfsense-dev-zero-cache
zero_init:
name: surfsense-dev-zero-init

View file

@ -27,6 +27,28 @@ services:
timeout: 5s
retries: 5
# Short-lived schema runner. Executes `alembic upgrade head` and verifies
# that the `zero_publication` Postgres logical-replication publication
# exists, then exits 0. Downstream services (backend, celery_*, zero-cache)
# gate on this with `condition: service_completed_successfully` so a failed
# migration halts the whole stack instead of silently producing a half-built
# system that crash-loops zero-cache on missing publications.
migrations:
image: ghcr.io/modsetter/surfsense-backend:${SURFSENSE_VERSION:-latest}
env_file:
- .env
environment:
DATABASE_URL: ${DATABASE_URL:-postgresql+asyncpg://${DB_USER:-surfsense}:${DB_PASSWORD:-surfsense}@${DB_HOST:-db}:${DB_PORT:-5432}/${DB_NAME:-surfsense}}
PYTHONPATH: /app
SERVICE_ROLE: migrate
MIGRATION_TIMEOUT: ${MIGRATION_TIMEOUT:-900}
volumes:
- zero_init:/zero-init
depends_on:
db:
condition: service_healthy
restart: "no"
redis:
image: redis:8-alpine
volumes:
@ -88,9 +110,11 @@ services:
condition: service_healthy
searxng:
condition: service_healthy
migrations:
condition: service_completed_successfully
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
test: ["CMD", "curl", "-f", "http://localhost:8000/ready"]
interval: 15s
timeout: 5s
retries: 30
@ -118,6 +142,8 @@ services:
condition: service_healthy
redis:
condition: service_healthy
migrations:
condition: service_completed_successfully
backend:
condition: service_healthy
labels:
@ -140,6 +166,8 @@ services:
condition: service_healthy
redis:
condition: service_healthy
migrations:
condition: service_completed_successfully
celery_worker:
condition: service_started
labels:
@ -182,10 +210,21 @@ services:
ZERO_MUTATE_URL: ${ZERO_MUTATE_URL:-http://frontend:3000/api/zero/mutate}
volumes:
- zero_cache_data:/data
- zero_init:/zero-init
# Wrapper: if the migrations service flagged a publication change via
# /zero-init/needs_reset, wipe the SQLite replica before starting so
# zero-cache does a clean initial sync. Recovers from the half-built
# replica state (`_zero.tableMetadata` missing) caused by earlier crashes.
entrypoint: ["sh", "-c"]
# Pass the script as a single list element so Compose does not tokenize it.
command:
- 'if [ -f /zero-init/needs_reset ]; then echo "[zero-init] publication change detected; wiping replica file(s) under /data" && rm -f /data/zero.db /data/zero.db-shm /data/zero.db-wal && rm -f /zero-init/needs_reset; fi; exec zero-cache'
restart: unless-stopped
depends_on:
backend:
db:
condition: service_healthy
migrations:
condition: service_completed_successfully
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:4848/keepalive"]
interval: 10s
@ -221,3 +260,5 @@ volumes:
name: surfsense-shared-temp
zero_cache_data:
name: surfsense-zero-cache
zero_init:
name: surfsense-zero-init

View file

@ -97,6 +97,161 @@ function Wait-ForPostgres {
Write-Ok "PostgreSQL is ready."
}
# ── Stack health helpers ────────────────────────────────────────────────────
function Get-ComposeServices {
Push-Location $InstallDir
try {
$raw = Invoke-NativeSafe { docker compose ps -a --format json 2>$null }
} finally {
Pop-Location
}
if ([string]::IsNullOrWhiteSpace($raw)) { return @() }
# Compose v2.21+ emits a JSON array; older versions emit one object per line.
try {
$parsed = $raw | ConvertFrom-Json
if ($parsed -is [System.Collections.IEnumerable] -and -not ($parsed -is [string])) {
return @($parsed)
}
return @($parsed)
} catch {
$services = @()
foreach ($line in ($raw -split "`r?`n")) {
$line = $line.Trim()
if (-not $line) { continue }
try { $services += ($line | ConvertFrom-Json) } catch { }
}
return $services
}
}
function Wait-StackHealthy {
param([int]$TimeoutSec = 300)
$deadline = (Get-Date).AddSeconds($TimeoutSec)
$lastReport = ""
while ((Get-Date) -lt $deadline) {
$services = Get-ComposeServices
if (-not $services -or $services.Count -eq 0) {
Start-Sleep -Seconds 3
continue
}
$bad = @()
$waiting = @()
$good = @()
foreach ($svc in $services) {
$name = $svc.Service
$state = $svc.State
$health = if ($svc.PSObject.Properties.Name -contains 'Health') { $svc.Health } else { '' }
$exit = if ($svc.PSObject.Properties.Name -contains 'ExitCode') { $svc.ExitCode } else { $null }
if ($name -eq 'migrations') {
if ($state -eq 'exited' -and $exit -eq 0) { $good += $name }
elseif ($state -eq 'exited') { $bad += "${name} (exit=${exit})" }
else { $waiting += "${name} (${state})" }
continue
}
if ($state -eq 'running') {
if ([string]::IsNullOrEmpty($health) -or $health -eq 'healthy') {
$good += $name
} elseif ($health -eq 'starting') {
$waiting += "${name} (starting)"
} elseif ($health -eq 'unhealthy') {
$bad += "${name} (unhealthy)"
} else {
$waiting += "${name} (${health})"
}
} elseif ($state -eq 'restarting') {
$bad += "${name} (restarting)"
} elseif ($state -eq 'exited') {
$bad += "${name} (exited, code=${exit})"
} else {
$waiting += "${name} (${state})"
}
}
if ($bad.Count -gt 0) {
return @{ Ok = $false; Reason = 'failure'; Bad = $bad; Waiting = $waiting; Good = $good }
}
if ($waiting.Count -eq 0) {
return @{ Ok = $true; Reason = 'all_healthy'; Good = $good }
}
$report = "Waiting on: " + ($waiting -join ', ')
if ($report -ne $lastReport) {
Write-Info $report
$lastReport = $report
}
Start-Sleep -Seconds 5
}
return @{ Ok = $false; Reason = 'timeout'; Bad = $bad; Waiting = $waiting; Good = $good }
}
function Test-StaleZeroCacheVolume {
$raw = Invoke-NativeSafe { docker volume ls --format '{{.Name}}' 2>$null }
if ([string]::IsNullOrWhiteSpace($raw)) { return $false }
$names = $raw -split "`r?`n" | ForEach-Object { $_.Trim() } | Where-Object { $_ }
$hasZeroCache = $names -contains 'surfsense-zero-cache'
$hasZeroInit = $names -contains 'surfsense-zero-init'
# Pre-fix installs created surfsense-zero-cache but never surfsense-zero-init.
# Such a volume may hold a half-initialized SQLite replica from an earlier
# crash-loop. Wiping it forces zero-cache to do a fresh initial sync.
return ($hasZeroCache -and -not $hasZeroInit)
}
function Invoke-StaleZeroCacheCleanup {
if (-not (Test-StaleZeroCacheVolume)) { return }
Write-Warn "Detected pre-existing 'surfsense-zero-cache' volume from an install that"
Write-Warn "predates the migrations-service fix. It may contain a half-initialized"
Write-Warn "SQLite replica that would block zero-cache from starting."
Write-Warn "The volume will be removed in 5 seconds; press Ctrl+C to cancel."
Start-Sleep -Seconds 5
Push-Location $InstallDir
Invoke-NativeSafe { docker compose down --remove-orphans 2>$null } | Out-Null
Pop-Location
Invoke-NativeSafe { docker volume rm surfsense-zero-cache 2>$null } | Out-Null
Write-Ok "Removed surfsense-zero-cache volume; zero-cache will re-sync on next start."
}
function Write-Err-NoExit {
param([string]$Message)
Write-Host "[ERROR] $Message" -ForegroundColor Red
}
function Invoke-StackFailureReport {
param([hashtable]$Result)
Write-Host ""
Write-Err-NoExit "Stack did not reach a healthy state."
if ($Result.Bad.Count -gt 0) { Write-Host (" Failed: " + ($Result.Bad -join ', ')) }
if ($Result.Waiting.Count -gt 0) { Write-Host (" Stuck: " + ($Result.Waiting -join ', ')) }
Write-Host ""
Write-Info "Recent logs from migrations / zero-cache / backend:"
Push-Location $InstallDir
try {
Invoke-NativeSafe { docker compose logs --tail=60 migrations zero-cache backend 2>&1 } | Write-Host
} finally {
Pop-Location
}
Write-Host ""
Write-Host "Recovery hints:" -ForegroundColor Yellow
Write-Host " 1. Inspect migrations: cd $InstallDir; docker compose logs migrations"
Write-Host " 2. Verify publication: cd $InstallDir; docker compose exec db psql -U surfsense -d surfsense -c 'SELECT pubname FROM pg_publication;'"
Write-Host " 3. Hard reset zero db: cd $InstallDir; docker compose down; docker volume rm surfsense-zero-cache; docker compose up -d"
Write-Host ""
exit 1
}
# ── Download files ──────────────────────────────────────────────────────────
Write-Step "Downloading SurfSense files"
@ -191,6 +346,8 @@ if (-not (Test-Path $envPath)) {
# ── Start containers ────────────────────────────────────────────────────────
Invoke-StaleZeroCacheCleanup
if ($MigrationMode) {
$envContent = Get-Content $envPath
$DbUser = ($envContent | Select-String '^DB_USER=' | ForEach-Object { ($_ -split '=',2)[1].Trim('"') }) | Select-Object -First 1
@ -251,7 +408,13 @@ if ($MigrationMode) {
Push-Location $InstallDir
Invoke-NativeSafe { docker compose up -d }
Pop-Location
Write-Ok "All services started."
Write-Ok "All containers started; waiting for stack to become healthy..."
$waitResult = Wait-StackHealthy -TimeoutSec 300
if (-not $waitResult.Ok) {
Invoke-StackFailureReport -Result $waitResult
}
Write-Ok "All services healthy."
Remove-Item $KeyFile -ErrorAction SilentlyContinue
@ -260,7 +423,13 @@ if ($MigrationMode) {
Push-Location $InstallDir
Invoke-NativeSafe { docker compose up -d }
Pop-Location
Write-Ok "All services started."
Write-Ok "All containers started; waiting for stack to become healthy..."
$waitResult = Wait-StackHealthy -TimeoutSec 300
if (-not $waitResult.Ok) {
Invoke-StackFailureReport -Result $waitResult
}
Write-Ok "All services healthy."
}
# ── Watchtower (auto-update) ────────────────────────────────────────────────

View file

@ -97,6 +97,163 @@ wait_for_pg() {
success "PostgreSQL is ready."
}
# ── Stack health helpers ─────────────────────────────────────────────────────
# Enumerate compose services for project `surfsense` as `service|state|health|exitcode`
# lines. Uses `docker inspect` so we don't depend on `jq`, `python3`, or the
# exact ordering of fields in `docker compose ps --format json` output.
get_compose_services() {
local containers
containers=$(docker ps -a --filter "label=com.docker.compose.project=surfsense" --format '{{.Names}}' 2>/dev/null) || true
[[ -z "$containers" ]] && return 0
while IFS= read -r container; do
[[ -z "$container" ]] && continue
local svc state health code
svc=$(docker inspect -f '{{index .Config.Labels "com.docker.compose.service"}}' "$container" 2>/dev/null || echo "")
state=$(docker inspect -f '{{.State.Status}}' "$container" 2>/dev/null || echo "unknown")
health=$(docker inspect -f '{{if .State.Health}}{{.State.Health.Status}}{{end}}' "$container" 2>/dev/null || echo "")
code=$(docker inspect -f '{{.State.ExitCode}}' "$container" 2>/dev/null || echo "")
[[ -z "$svc" ]] && continue
printf '%s|%s|%s|%s\n' "$svc" "$state" "$health" "$code"
done <<< "$containers"
}
# Globals populated by wait_stack_healthy / consumed by stack_failure_report.
STACK_BAD=()
STACK_WAITING=()
STACK_GOOD=()
STACK_TIMEOUT=false
wait_stack_healthy() {
local timeout_sec=${1:-300}
local deadline=$(($(date +%s) + timeout_sec))
local last_report=""
local bad=()
local waiting=()
local good=()
while [[ $(date +%s) -lt $deadline ]]; do
local lines
lines=$(get_compose_services)
if [[ -z "$lines" ]]; then
sleep 3
continue
fi
bad=()
waiting=()
good=()
while IFS='|' read -r name state health code; do
[[ -z "$name" ]] && continue
if [[ "$name" == "migrations" ]]; then
if [[ "$state" == "exited" && "$code" == "0" ]]; then
good+=("$name")
elif [[ "$state" == "exited" ]]; then
bad+=("${name} (exit=${code})")
else
waiting+=("${name} (${state})")
fi
continue
fi
if [[ "$state" == "running" ]]; then
if [[ -z "$health" || "$health" == "healthy" ]]; then
good+=("$name")
elif [[ "$health" == "starting" ]]; then
waiting+=("${name} (starting)")
elif [[ "$health" == "unhealthy" ]]; then
bad+=("${name} (unhealthy)")
else
waiting+=("${name} (${health})")
fi
elif [[ "$state" == "restarting" ]]; then
bad+=("${name} (restarting)")
elif [[ "$state" == "exited" ]]; then
bad+=("${name} (exited, code=${code})")
else
waiting+=("${name} (${state})")
fi
done <<< "$lines"
if (( ${#bad[@]} > 0 )); then
STACK_BAD=("${bad[@]}")
STACK_WAITING=("${waiting[@]}")
STACK_GOOD=("${good[@]}")
return 1
fi
if (( ${#waiting[@]} == 0 )); then
STACK_GOOD=("${good[@]}")
return 0
fi
local report="Waiting on: ${waiting[*]}"
if [[ "$report" != "$last_report" ]]; then
info "$report"
last_report="$report"
fi
sleep 5
done
# bad/waiting/good are declared at function scope so referencing them is
# safe even if the polling loop never executed its body.
STACK_BAD=()
[[ ${#bad[@]} -gt 0 ]] && STACK_BAD=("${bad[@]}")
STACK_WAITING=()
[[ ${#waiting[@]} -gt 0 ]] && STACK_WAITING=("${waiting[@]}")
STACK_GOOD=()
[[ ${#good[@]} -gt 0 ]] && STACK_GOOD=("${good[@]}")
STACK_TIMEOUT=true
return 1
}
stack_failure_report() {
echo ""
echo -e "\033[31m[ERROR]\033[0m Stack did not reach a healthy state."
if (( ${#STACK_BAD[@]} > 0 )) && [[ -n "${STACK_BAD[0]}" ]]; then
echo " Failed: ${STACK_BAD[*]}"
fi
if (( ${#STACK_WAITING[@]} > 0 )) && [[ -n "${STACK_WAITING[0]}" ]]; then
echo " Stuck: ${STACK_WAITING[*]}"
fi
echo ""
info "Recent logs from migrations / zero-cache / backend:"
(cd "${INSTALL_DIR}" && ${DC} logs --tail=60 migrations zero-cache backend 2>&1) || true
echo ""
echo "Recovery hints:"
echo " 1. Inspect migrations: cd ${INSTALL_DIR} && ${DC} logs migrations"
echo " 2. Verify publication: cd ${INSTALL_DIR} && ${DC} exec db psql -U surfsense -d surfsense -c 'SELECT pubname FROM pg_publication;'"
echo " 3. Hard reset zero db: cd ${INSTALL_DIR} && ${DC} down && docker volume rm surfsense-zero-cache && ${DC} up -d"
echo ""
exit 1
}
# True if `surfsense-zero-cache` exists but `surfsense-zero-init` does not.
# That signals an install that predates the migrations-service fix; the old
# replica may be half-initialized and would block zero-cache on next start.
test_stale_zero_cache_volume() {
local has_zc has_zi
has_zc=$(docker volume ls --format '{{.Name}}' 2>/dev/null | grep -Fx 'surfsense-zero-cache' || true)
has_zi=$(docker volume ls --format '{{.Name}}' 2>/dev/null | grep -Fx 'surfsense-zero-init' || true)
[[ -n "$has_zc" && -z "$has_zi" ]]
}
invoke_stale_zero_cache_cleanup() {
if ! test_stale_zero_cache_volume; then
return 0
fi
warn "Detected pre-existing 'surfsense-zero-cache' volume from an install that"
warn "predates the migrations-service fix. It may contain a half-initialized"
warn "SQLite replica that would block zero-cache from starting."
warn "The volume will be removed in 5 seconds; press Ctrl+C to cancel."
sleep 5
(cd "${INSTALL_DIR}" && ${DC} down --remove-orphans 2>/dev/null) || true
docker volume rm surfsense-zero-cache 2>/dev/null || true
success "Removed surfsense-zero-cache volume; zero-cache will re-sync on next start."
}
# ── Download files ───────────────────────────────────────────────────────────
step "Downloading SurfSense files"
@ -186,6 +343,8 @@ fi
# ── Start containers ─────────────────────────────────────────────────────────
invoke_stale_zero_cache_cleanup
if $MIGRATION_MODE; then
# Read DB credentials from .env (fall back to defaults from docker-compose.yml)
DB_USER=$(grep '^DB_USER=' "${INSTALL_DIR}/.env" 2>/dev/null | cut -d= -f2 | tr -d '"' | head -1 || true)
@ -243,7 +402,12 @@ if $MIGRATION_MODE; then
step "Starting all SurfSense services"
(cd "${INSTALL_DIR}" && ${DC} up -d) < /dev/null
success "All services started."
success "All containers started; waiting for stack to become healthy..."
if ! wait_stack_healthy 300; then
stack_failure_report
fi
success "All services healthy."
# Key file is no longer needed — SECRET_KEY is now in .env
rm -f "${KEY_FILE}"
@ -251,7 +415,12 @@ if $MIGRATION_MODE; then
else
step "Starting SurfSense"
(cd "${INSTALL_DIR}" && ${DC} up -d) < /dev/null
success "All services started."
success "All containers started; waiting for stack to become healthy..."
if ! wait_stack_healthy 300; then
stack_failure_report
fi
success "All services healthy."
fi
# ── Watchtower (auto-update) ─────────────────────────────────────────────────

View file

@ -167,10 +167,14 @@ COPY scripts/docker/entrypoint.sh /app/scripts/docker/entrypoint.sh
RUN dos2unix /app/scripts/docker/entrypoint.sh && chmod +x /app/scripts/docker/entrypoint.sh
# SERVICE_ROLE controls which process this container runs:
# api FastAPI backend only (runs migrations on startup)
# migrate Run alembic upgrade head, verify zero_publication exists, exit 0.
# Used by the dedicated `migrations` service in docker-compose.yml
# so downstream services gate on `service_completed_successfully`.
# api FastAPI backend only (does NOT run migrations)
# worker Celery worker only
# beat Celery beat scheduler only
# all All three (legacy / dev default)
# all migrations + api + worker + beat (legacy / dev default;
# fails fast on migration error)
ENV SERVICE_ROLE=all
# Celery worker tuning (only used when SERVICE_ROLE=worker or all)

View file

@ -945,6 +945,36 @@ async def health_check():
return {"status": "ok"}
@app.get("/ready", tags=["health"])
@limiter.exempt
async def readiness_check():
"""Readiness probe.
Verifies that the schema state required by downstream services is
present. Specifically checks that the ``zero_publication`` Postgres
logical-replication publication exists; without it zero-cache crash-loops
on `Unknown or invalid publications`.
Returns 200 when ready, 503 otherwise. Used by the docker-compose
backend healthcheck and by ``install.ps1`` / ``install.sh`` post-up
verification.
"""
from sqlalchemy import text
from app.db import async_session_maker
async with async_session_maker() as session:
result = await session.execute(
text("SELECT 1 FROM pg_publication WHERE pubname = 'zero_publication'")
)
if result.first() is None:
raise HTTPException(
status_code=503,
detail="zero_publication missing; run alembic upgrade head",
)
return {"status": "ready"}
@app.get("/verify-token")
async def authenticated_route(
user: User = Depends(current_active_user),

View file

@ -4,10 +4,15 @@ set -e
# ─────────────────────────────────────────────────────────────
# SERVICE_ROLE controls which process(es) this container runs.
#
# api FastAPI backend only (runs migrations on startup)
# migrate Run `alembic upgrade head`, verify zero_publication,
# then exit 0. Used by the dedicated `migrations` service
# in docker-compose.yml so downstream services can gate
# on `condition: service_completed_successfully`.
# api FastAPI backend only (does NOT run migrations)
# worker Celery worker only
# beat Celery beat scheduler only
# all All three in one container (legacy / dev default)
# all migrations + api + worker + beat in one container
# (legacy / dev default; fails fast on migration error)
#
# Set SERVICE_ROLE as an environment variable in Coolify for
# each service deployment.
@ -41,7 +46,13 @@ cleanup() {
trap cleanup SIGTERM SIGINT
# ── Database migrations (only for api / all) ─────────────────
# ── Database migrations (only for migrate / all) ─────────────
# Fail-fast contract:
# - alembic upgrade head must succeed within ${MIGRATION_TIMEOUT:-900}s
# - zero_publication must exist in pg_publication afterwards
# Either failure exits non-zero so the dedicated `migrations` compose
# service exits non-zero, halting the rest of the stack instead of
# silently producing a half-built system that crash-loops zero-cache.
run_migrations() {
echo "Running database migrations..."
for i in {1..30}; do
@ -53,11 +64,66 @@ run_migrations() {
sleep 1
done
if timeout 300 alembic upgrade head 2>&1; then
echo "Migrations completed successfully."
else
echo "WARNING: Migration failed or timed out. Continuing anyway..."
echo "You may need to run migrations manually: alembic upgrade head"
local timeout_secs="${MIGRATION_TIMEOUT:-900}"
echo "Running alembic upgrade head (timeout=${timeout_secs}s)..."
if ! timeout "${timeout_secs}" alembic upgrade head; then
echo "ERROR: alembic upgrade head failed (or exceeded ${timeout_secs}s timeout)." >&2
echo "Refusing to start. Inspect the error above and re-run." >&2
exit 1
fi
echo "Migrations completed successfully."
echo "Verifying zero_publication exists in Postgres..."
local pub_oid
pub_oid=$(python <<'PY' 2>/dev/null || true
import asyncio
import sys
from sqlalchemy import text
from app.db import engine
async def get_oid():
async with engine.connect() as conn:
result = await conn.execute(
text("SELECT oid FROM pg_publication WHERE pubname = 'zero_publication'")
)
row = result.first()
if row is None:
sys.exit(1)
print(int(row[0]))
asyncio.run(get_oid())
PY
)
if [ -z "${pub_oid}" ]; then
echo "ERROR: zero_publication is missing from Postgres after running alembic." >&2
echo "This usually means migration 116 (or a later publication migration) did not run." >&2
echo "Inspect alembic state with:" >&2
echo " docker compose exec db psql -U \"\$DB_USER\" -d \"\$DB_NAME\" -c 'SELECT * FROM alembic_version;'" >&2
exit 1
fi
echo "zero_publication verified (oid=${pub_oid})."
# Stale-replica safety net: if /zero-init is mounted (i.e. we are the
# dedicated `migrations` compose service), drop a marker file when the
# publication oid changed (or on first run) so the wrapped zero-cache
# entrypoint can wipe /data/zero.db before starting. This recovers from
# the case where a previous zero-cache crashed mid-init and left a
# half-built SQLite replica without a `_zero.tableMetadata` table.
if [ -d /zero-init ]; then
local stored_oid=""
[ -f /zero-init/last_pub_oid ] && stored_oid=$(cat /zero-init/last_pub_oid 2>/dev/null || true)
if [ -z "${stored_oid}" ] || [ "${stored_oid}" != "${pub_oid}" ]; then
echo "Publication oid changed (stored=${stored_oid:-<none>}, current=${pub_oid}); writing /zero-init/needs_reset."
: > /zero-init/needs_reset
chmod 666 /zero-init/needs_reset 2>/dev/null || true
fi
echo "${pub_oid}" > /zero-init/last_pub_oid
chmod 666 /zero-init/last_pub_oid 2>/dev/null || true
# World-writable dir so the (possibly non-root) zero-cache container
# can `rm -f /zero-init/needs_reset` after acting on the marker.
chmod 777 /zero-init 2>/dev/null || true
fi
}
@ -102,8 +168,12 @@ start_beat() {
# ── Main: run based on role ──────────────────────────────────
case "${SERVICE_ROLE}" in
api)
migrate)
run_migrations
echo "Migrations complete; exiting cleanly."
exit 0
;;
api)
start_api
;;
worker)
@ -121,7 +191,7 @@ case "${SERVICE_ROLE}" in
start_beat
;;
*)
echo "ERROR: Unknown SERVICE_ROLE '${SERVICE_ROLE}'. Use: api, worker, beat, or all"
echo "ERROR: Unknown SERVICE_ROLE '${SERVICE_ROLE}'. Use: migrate, api, worker, beat, or all"
exit 1
;;
esac

View file

@ -71,7 +71,7 @@ Defaults work out of the box. Change `ZERO_ADMIN_PASSWORD` for security in produ
| `ZERO_UPSTREAM_DB` | PostgreSQL connection URL for replication (must be a direct connection, not via pgbouncer) | *(built from DB_* vars)* |
| `ZERO_CVR_DB` | PostgreSQL connection URL for client view records | *(built from DB_* vars)* |
| `ZERO_CHANGE_DB` | PostgreSQL connection URL for replication log entries | *(built from DB_* vars)* |
| `ZERO_APP_PUBLICATIONS` | PostgreSQL publication restricting which tables are replicated (created by migration 116) | `zero_publication` |
| `ZERO_APP_PUBLICATIONS` | PostgreSQL publication restricting which tables are replicated (created by migration 116, verified by the `migrations` service before `zero-cache` starts) | `zero_publication` |
| `ZERO_NUM_SYNC_WORKERS` | Number of view-sync worker processes. Must be ≤ connection pool sizes | `4` |
| `ZERO_UPSTREAM_MAX_CONNS` | Max connections to upstream PostgreSQL for mutations | `20` |
| `ZERO_CVR_MAX_CONNS` | Max connections to the CVR database | `30` |
@ -150,7 +150,9 @@ Uncomment the connectors you want to use. Redirect URIs follow the pattern `http
| Service | Description |
|---------|-------------|
| `db` | PostgreSQL with pgvector extension |
| `migrations` | Short-lived: runs `alembic upgrade head` and verifies `zero_publication`, then exits |
| `redis` | Message broker for Celery |
| `searxng` | Local privacy-respecting search backend |
| `backend` | FastAPI application server |
| `celery_worker` | Background task processing (document indexing, etc.) |
| `celery_beat` | Periodic task scheduler (connector sync) |
@ -159,7 +161,42 @@ Uncomment the connectors you want to use. Redirect URIs follow the pattern `http
All services start automatically with `docker compose up -d`.
The backend includes a health check. Dependent services (workers, frontend) wait until the API is fully ready before starting. You can monitor startup progress with `docker compose ps` (look for `(health: starting)` → `(healthy)`).
### How startup ordering works
Schema migrations run as a dedicated `migrations` service that exits 0 on
success and non-zero on failure. Every other backend-image service gates on
it via `condition: service_completed_successfully`:
```text
db (healthy) ──▶ migrations (alembic upgrade head + verify zero_publication)
├── exit 0 ─▶ backend ──▶ frontend
│ celery_worker
│ celery_beat
│ zero-cache ──▶ frontend
└── exit ≠ 0 ─▶ compose halts the rest of the stack
```
This guarantees `zero-cache` only starts after `zero_publication` exists in
Postgres. Before this design, a silent migration failure would leave
`zero-cache` crash-looping with `Unknown or invalid publications. Specified:
[zero_publication]. Found: []`.
### Readiness vs liveness
The backend exposes two endpoints:
- `GET /health` — lightweight liveness probe (always returns 200 if the
process is up).
- `GET /ready` — readiness probe that confirms `zero_publication` exists.
Returns 503 if not. The compose `backend.healthcheck` uses `/ready` so the
container only reports `healthy` once the schema is actually usable by
zero-cache.
You can also monitor startup progress with `docker compose ps` (look for
`(health: starting)` → `(healthy)`). The install script polls these states
automatically and times out after 5 minutes if the stack does not converge.
---
@ -188,6 +225,90 @@ docker compose down -v
- **Ports already in use**: Change the relevant `*_PORT` variable in `.env` and restart.
- **Permission errors on Linux**: You may need to prefix `docker` commands with `sudo`.
- **Zero-cache not starting**: Check `docker compose logs zero-cache`. Ensure PostgreSQL has `wal_level=logical` (configured automatically by the bundled `postgresql.conf`).
- **Real-time updates not working**: Open DevTools → Console and check for WebSocket errors. Verify `NEXT_PUBLIC_ZERO_CACHE_URL` matches the running zero-cache address.
- **Line ending issues on Windows**: Run `git config --global core.autocrlf true` before cloning.
### Migration service exited non-zero
The `migrations` service exits non-zero in two cases:
1. `alembic upgrade head` failed (timeout or SQL error).
2. `alembic` succeeded but `zero_publication` is still missing from
`pg_publication`.
Inspect the logs and the alembic state:
```bash
docker compose logs migrations
docker compose exec db psql -U surfsense -d surfsense \
-c 'SELECT * FROM alembic_version;'
docker compose exec db psql -U surfsense -d surfsense \
-c 'SELECT pubname FROM pg_publication;'
```
The default migration timeout is 900 seconds. Slow disks (Windows / WSL2)
may need more — set `MIGRATION_TIMEOUT` in `.env` to increase it.
### Zero-cache stuck on `Unknown or invalid publications`
Symptom (in `docker compose logs zero-cache`):
```text
Error: Unknown or invalid publications. Specified: [zero_publication]. Found: []
```
This means `zero-cache` started before `zero_publication` was created. With
the current compose files this should be impossible — the `migrations`
service blocks `zero-cache` from starting. If you see it, your stack
predates the fix or you brought up `zero-cache` manually with `docker
compose up zero-cache` before the migrations service ran.
Recovery:
```bash
docker compose down
docker volume rm surfsense-zero-cache # wipe half-built SQLite replica
docker compose up -d # migrations runs first, then zero-cache
```
The install script (`install.ps1` / `install.sh`) detects this case
automatically: if it finds a `surfsense-zero-cache` volume from a previous
install with no matching `surfsense-zero-init` volume, it removes the stale
volume before bringing the stack up.
### Zero-cache crashes with `_zero.tableMetadata` errors
This indicates a half-initialized SQLite replica left behind by a previous
crash. The `migrations` service writes a marker file on a shared volume
(`surfsense-zero-init`) when the publication oid changes; zero-cache wipes
its replica and re-syncs on next start. If the marker mechanism somehow did
not trigger, run the recovery one-liner above.
### Ensuring `wal_level = logical`
Logical replication is required by zero-cache. The bundled
`docker/postgresql.conf` sets `wal_level = logical` automatically. If you
swap in your own config or use a managed Postgres, confirm with:
```bash
docker compose exec db psql -U surfsense -d surfsense \
-c "SHOW wal_level;"
```
### Using `docker-compose.deps-only.yml`
`docker-compose.deps-only.yml` runs only the dependencies (Postgres, Redis,
SearXNG, zero-cache) on Docker while the backend and frontend run on the
host. Because there is no backend container in this stack, there is no
`migrations` service either, and you must run alembic on the host **before**
bringing the stack up:
```bash
cd surfsense_backend
uv run alembic upgrade head
cd ../docker
docker compose -f docker-compose.deps-only.yml up -d
```
If you skip the alembic step, `zero-cache` will crash-loop with `Unknown or
invalid publications. Specified: [zero_publication]`.