fix: docker one click setup

This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-05-20 01:25:07 -07:00
parent 8174949b38
commit b285293b4e
10 changed files with 681 additions and 27 deletions

View file

@ -20,6 +20,18 @@
# - Backend .env: SEARXNG_DEFAULT_HOST=http://localhost:${SEARXNG_PORT:-8888}
# - Backend .env: CELERY_BROKER_URL / REDIS_APP_URL → redis://localhost:6379/0
# - Web .env: NEXT_PUBLIC_ZERO_CACHE_URL=http://localhost:${ZERO_CACHE_PORT:-4848}
#
# IMPORTANT — schema migrations:
# This compose file does NOT build the backend image and therefore cannot
# run a `migrations` service. You MUST run alembic on the host before
# bringing zero-cache up, or zero-cache will crash-loop with
# `Unknown or invalid publications. Specified: [zero_publication]`.
#
# First-time / after-pull:
# cd surfsense_backend && uv run alembic upgrade head
#
# The other compose files (docker-compose.yml, docker-compose.dev.yml)
# handle this automatically via a dedicated `migrations` service.
# =============================================================================
name: surfsense-deps
@ -82,6 +94,10 @@ services:
timeout: 5s
retries: 5
# NOTE: zero-cache requires the `zero_publication` Postgres publication to
# exist before it starts. In this deps-only stack there is no backend
# container to run migrations, so you must run `uv run alembic upgrade head`
# from `surfsense_backend/` on the host BEFORE `docker compose up -d`.
zero-cache:
image: rocicorp/zero:1.4.0
ports:

View file

@ -34,6 +34,25 @@ services:
timeout: 5s
retries: 5
# Short-lived schema runner; see docker/docker-compose.yml `migrations`
# service for the full rationale. Builds from the same backend context as
# the dev backend/celery services.
migrations:
build: *backend-build
env_file:
- ../surfsense_backend/.env
environment:
- DATABASE_URL=${DATABASE_URL:-postgresql+asyncpg://${DB_USER:-postgres}:${DB_PASSWORD:-postgres}@${DB_HOST:-db}:${DB_PORT:-5432}/${DB_NAME:-surfsense}}
- PYTHONPATH=/app
- SERVICE_ROLE=migrate
- MIGRATION_TIMEOUT=${MIGRATION_TIMEOUT:-900}
volumes:
- zero_init:/zero-init
depends_on:
db:
condition: service_healthy
restart: "no"
pgadmin:
image: dpage/pgadmin4
ports:
@ -111,8 +130,10 @@ services:
condition: service_healthy
searxng:
condition: service_healthy
migrations:
condition: service_completed_successfully
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
test: ["CMD", "curl", "-f", "http://localhost:8000/ready"]
interval: 15s
timeout: 5s
retries: 30
@ -141,6 +162,8 @@ services:
condition: service_healthy
redis:
condition: service_healthy
migrations:
condition: service_completed_successfully
backend:
condition: service_healthy
@ -160,6 +183,8 @@ services:
condition: service_healthy
redis:
condition: service_healthy
migrations:
condition: service_completed_successfully
celery_worker:
condition: service_started
@ -185,8 +210,10 @@ services:
extra_hosts:
- "host.docker.internal:host-gateway"
depends_on:
backend:
db:
condition: service_healthy
migrations:
condition: service_completed_successfully
environment:
- ZERO_UPSTREAM_DB=${ZERO_UPSTREAM_DB:-postgresql://${DB_USER:-postgres}:${DB_PASSWORD:-postgres}@${DB_HOST:-db}:${DB_PORT:-5432}/${DB_NAME:-surfsense}?sslmode=${DB_SSLMODE:-disable}}
- ZERO_CVR_DB=${ZERO_CVR_DB:-postgresql://${DB_USER:-postgres}:${DB_PASSWORD:-postgres}@${DB_HOST:-db}:${DB_PORT:-5432}/${DB_NAME:-surfsense}?sslmode=${DB_SSLMODE:-disable}}
@ -201,6 +228,12 @@ services:
- ZERO_MUTATE_URL=${ZERO_MUTATE_URL:-http://frontend:3000/api/zero/mutate}
volumes:
- zero_cache_data:/data
- zero_init:/zero-init
# Wrapper: see docker/docker-compose.yml `zero-cache` for rationale.
entrypoint: ["sh", "-c"]
# Pass the script as a single list element so Compose does not tokenize it.
command:
- 'if [ -f /zero-init/needs_reset ]; then echo "[zero-init] publication change detected; wiping replica file(s) under /data" && rm -f /data/zero.db /data/zero.db-shm /data/zero.db-wal && rm -f /zero-init/needs_reset; fi; exec zero-cache'
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:4848/keepalive"]
@ -238,3 +271,5 @@ volumes:
name: surfsense-dev-shared-temp
zero_cache_data:
name: surfsense-dev-zero-cache
zero_init:
name: surfsense-dev-zero-init

View file

@ -27,6 +27,28 @@ services:
timeout: 5s
retries: 5
# Short-lived schema runner. Executes `alembic upgrade head` and verifies
# that the `zero_publication` Postgres logical-replication publication
# exists, then exits 0. Downstream services (backend, celery_*, zero-cache)
# gate on this with `condition: service_completed_successfully` so a failed
# migration halts the whole stack instead of silently producing a half-built
# system that crash-loops zero-cache on missing publications.
migrations:
image: ghcr.io/modsetter/surfsense-backend:${SURFSENSE_VERSION:-latest}
env_file:
- .env
environment:
DATABASE_URL: ${DATABASE_URL:-postgresql+asyncpg://${DB_USER:-surfsense}:${DB_PASSWORD:-surfsense}@${DB_HOST:-db}:${DB_PORT:-5432}/${DB_NAME:-surfsense}}
PYTHONPATH: /app
SERVICE_ROLE: migrate
MIGRATION_TIMEOUT: ${MIGRATION_TIMEOUT:-900}
volumes:
- zero_init:/zero-init
depends_on:
db:
condition: service_healthy
restart: "no"
redis:
image: redis:8-alpine
volumes:
@ -88,9 +110,11 @@ services:
condition: service_healthy
searxng:
condition: service_healthy
migrations:
condition: service_completed_successfully
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
test: ["CMD", "curl", "-f", "http://localhost:8000/ready"]
interval: 15s
timeout: 5s
retries: 30
@ -118,6 +142,8 @@ services:
condition: service_healthy
redis:
condition: service_healthy
migrations:
condition: service_completed_successfully
backend:
condition: service_healthy
labels:
@ -140,6 +166,8 @@ services:
condition: service_healthy
redis:
condition: service_healthy
migrations:
condition: service_completed_successfully
celery_worker:
condition: service_started
labels:
@ -182,10 +210,21 @@ services:
ZERO_MUTATE_URL: ${ZERO_MUTATE_URL:-http://frontend:3000/api/zero/mutate}
volumes:
- zero_cache_data:/data
- zero_init:/zero-init
# Wrapper: if the migrations service flagged a publication change via
# /zero-init/needs_reset, wipe the SQLite replica before starting so
# zero-cache does a clean initial sync. Recovers from the half-built
# replica state (`_zero.tableMetadata` missing) caused by earlier crashes.
entrypoint: ["sh", "-c"]
# Pass the script as a single list element so Compose does not tokenize it.
command:
- 'if [ -f /zero-init/needs_reset ]; then echo "[zero-init] publication change detected; wiping replica file(s) under /data" && rm -f /data/zero.db /data/zero.db-shm /data/zero.db-wal && rm -f /zero-init/needs_reset; fi; exec zero-cache'
restart: unless-stopped
depends_on:
backend:
db:
condition: service_healthy
migrations:
condition: service_completed_successfully
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:4848/keepalive"]
interval: 10s
@ -221,3 +260,5 @@ volumes:
name: surfsense-shared-temp
zero_cache_data:
name: surfsense-zero-cache
zero_init:
name: surfsense-zero-init

View file

@ -97,6 +97,161 @@ function Wait-ForPostgres {
Write-Ok "PostgreSQL is ready."
}
# ── Stack health helpers ────────────────────────────────────────────────────
function Get-ComposeServices {
Push-Location $InstallDir
try {
$raw = Invoke-NativeSafe { docker compose ps -a --format json 2>$null }
} finally {
Pop-Location
}
if ([string]::IsNullOrWhiteSpace($raw)) { return @() }
# Compose v2.21+ emits a JSON array; older versions emit one object per line.
try {
$parsed = $raw | ConvertFrom-Json
if ($parsed -is [System.Collections.IEnumerable] -and -not ($parsed -is [string])) {
return @($parsed)
}
return @($parsed)
} catch {
$services = @()
foreach ($line in ($raw -split "`r?`n")) {
$line = $line.Trim()
if (-not $line) { continue }
try { $services += ($line | ConvertFrom-Json) } catch { }
}
return $services
}
}
function Wait-StackHealthy {
param([int]$TimeoutSec = 300)
$deadline = (Get-Date).AddSeconds($TimeoutSec)
$lastReport = ""
while ((Get-Date) -lt $deadline) {
$services = Get-ComposeServices
if (-not $services -or $services.Count -eq 0) {
Start-Sleep -Seconds 3
continue
}
$bad = @()
$waiting = @()
$good = @()
foreach ($svc in $services) {
$name = $svc.Service
$state = $svc.State
$health = if ($svc.PSObject.Properties.Name -contains 'Health') { $svc.Health } else { '' }
$exit = if ($svc.PSObject.Properties.Name -contains 'ExitCode') { $svc.ExitCode } else { $null }
if ($name -eq 'migrations') {
if ($state -eq 'exited' -and $exit -eq 0) { $good += $name }
elseif ($state -eq 'exited') { $bad += "${name} (exit=${exit})" }
else { $waiting += "${name} (${state})" }
continue
}
if ($state -eq 'running') {
if ([string]::IsNullOrEmpty($health) -or $health -eq 'healthy') {
$good += $name
} elseif ($health -eq 'starting') {
$waiting += "${name} (starting)"
} elseif ($health -eq 'unhealthy') {
$bad += "${name} (unhealthy)"
} else {
$waiting += "${name} (${health})"
}
} elseif ($state -eq 'restarting') {
$bad += "${name} (restarting)"
} elseif ($state -eq 'exited') {
$bad += "${name} (exited, code=${exit})"
} else {
$waiting += "${name} (${state})"
}
}
if ($bad.Count -gt 0) {
return @{ Ok = $false; Reason = 'failure'; Bad = $bad; Waiting = $waiting; Good = $good }
}
if ($waiting.Count -eq 0) {
return @{ Ok = $true; Reason = 'all_healthy'; Good = $good }
}
$report = "Waiting on: " + ($waiting -join ', ')
if ($report -ne $lastReport) {
Write-Info $report
$lastReport = $report
}
Start-Sleep -Seconds 5
}
return @{ Ok = $false; Reason = 'timeout'; Bad = $bad; Waiting = $waiting; Good = $good }
}
function Test-StaleZeroCacheVolume {
$raw = Invoke-NativeSafe { docker volume ls --format '{{.Name}}' 2>$null }
if ([string]::IsNullOrWhiteSpace($raw)) { return $false }
$names = $raw -split "`r?`n" | ForEach-Object { $_.Trim() } | Where-Object { $_ }
$hasZeroCache = $names -contains 'surfsense-zero-cache'
$hasZeroInit = $names -contains 'surfsense-zero-init'
# Pre-fix installs created surfsense-zero-cache but never surfsense-zero-init.
# Such a volume may hold a half-initialized SQLite replica from an earlier
# crash-loop. Wiping it forces zero-cache to do a fresh initial sync.
return ($hasZeroCache -and -not $hasZeroInit)
}
function Invoke-StaleZeroCacheCleanup {
if (-not (Test-StaleZeroCacheVolume)) { return }
Write-Warn "Detected pre-existing 'surfsense-zero-cache' volume from an install that"
Write-Warn "predates the migrations-service fix. It may contain a half-initialized"
Write-Warn "SQLite replica that would block zero-cache from starting."
Write-Warn "The volume will be removed in 5 seconds; press Ctrl+C to cancel."
Start-Sleep -Seconds 5
Push-Location $InstallDir
Invoke-NativeSafe { docker compose down --remove-orphans 2>$null } | Out-Null
Pop-Location
Invoke-NativeSafe { docker volume rm surfsense-zero-cache 2>$null } | Out-Null
Write-Ok "Removed surfsense-zero-cache volume; zero-cache will re-sync on next start."
}
function Write-Err-NoExit {
param([string]$Message)
Write-Host "[ERROR] $Message" -ForegroundColor Red
}
function Invoke-StackFailureReport {
param([hashtable]$Result)
Write-Host ""
Write-Err-NoExit "Stack did not reach a healthy state."
if ($Result.Bad.Count -gt 0) { Write-Host (" Failed: " + ($Result.Bad -join ', ')) }
if ($Result.Waiting.Count -gt 0) { Write-Host (" Stuck: " + ($Result.Waiting -join ', ')) }
Write-Host ""
Write-Info "Recent logs from migrations / zero-cache / backend:"
Push-Location $InstallDir
try {
Invoke-NativeSafe { docker compose logs --tail=60 migrations zero-cache backend 2>&1 } | Write-Host
} finally {
Pop-Location
}
Write-Host ""
Write-Host "Recovery hints:" -ForegroundColor Yellow
Write-Host " 1. Inspect migrations: cd $InstallDir; docker compose logs migrations"
Write-Host " 2. Verify publication: cd $InstallDir; docker compose exec db psql -U surfsense -d surfsense -c 'SELECT pubname FROM pg_publication;'"
Write-Host " 3. Hard reset zero db: cd $InstallDir; docker compose down; docker volume rm surfsense-zero-cache; docker compose up -d"
Write-Host ""
exit 1
}
# ── Download files ──────────────────────────────────────────────────────────
Write-Step "Downloading SurfSense files"
@ -191,6 +346,8 @@ if (-not (Test-Path $envPath)) {
# ── Start containers ────────────────────────────────────────────────────────
Invoke-StaleZeroCacheCleanup
if ($MigrationMode) {
$envContent = Get-Content $envPath
$DbUser = ($envContent | Select-String '^DB_USER=' | ForEach-Object { ($_ -split '=',2)[1].Trim('"') }) | Select-Object -First 1
@ -251,7 +408,13 @@ if ($MigrationMode) {
Push-Location $InstallDir
Invoke-NativeSafe { docker compose up -d }
Pop-Location
Write-Ok "All services started."
Write-Ok "All containers started; waiting for stack to become healthy..."
$waitResult = Wait-StackHealthy -TimeoutSec 300
if (-not $waitResult.Ok) {
Invoke-StackFailureReport -Result $waitResult
}
Write-Ok "All services healthy."
Remove-Item $KeyFile -ErrorAction SilentlyContinue
@ -260,7 +423,13 @@ if ($MigrationMode) {
Push-Location $InstallDir
Invoke-NativeSafe { docker compose up -d }
Pop-Location
Write-Ok "All services started."
Write-Ok "All containers started; waiting for stack to become healthy..."
$waitResult = Wait-StackHealthy -TimeoutSec 300
if (-not $waitResult.Ok) {
Invoke-StackFailureReport -Result $waitResult
}
Write-Ok "All services healthy."
}
# ── Watchtower (auto-update) ────────────────────────────────────────────────

View file

@ -97,6 +97,163 @@ wait_for_pg() {
success "PostgreSQL is ready."
}
# ── Stack health helpers ─────────────────────────────────────────────────────
# Enumerate compose services for project `surfsense` as `service|state|health|exitcode`
# lines. Uses `docker inspect` so we don't depend on `jq`, `python3`, or the
# exact ordering of fields in `docker compose ps --format json` output.
get_compose_services() {
local containers
containers=$(docker ps -a --filter "label=com.docker.compose.project=surfsense" --format '{{.Names}}' 2>/dev/null) || true
[[ -z "$containers" ]] && return 0
while IFS= read -r container; do
[[ -z "$container" ]] && continue
local svc state health code
svc=$(docker inspect -f '{{index .Config.Labels "com.docker.compose.service"}}' "$container" 2>/dev/null || echo "")
state=$(docker inspect -f '{{.State.Status}}' "$container" 2>/dev/null || echo "unknown")
health=$(docker inspect -f '{{if .State.Health}}{{.State.Health.Status}}{{end}}' "$container" 2>/dev/null || echo "")
code=$(docker inspect -f '{{.State.ExitCode}}' "$container" 2>/dev/null || echo "")
[[ -z "$svc" ]] && continue
printf '%s|%s|%s|%s\n' "$svc" "$state" "$health" "$code"
done <<< "$containers"
}
# Globals populated by wait_stack_healthy / consumed by stack_failure_report.
STACK_BAD=()
STACK_WAITING=()
STACK_GOOD=()
STACK_TIMEOUT=false
wait_stack_healthy() {
local timeout_sec=${1:-300}
local deadline=$(($(date +%s) + timeout_sec))
local last_report=""
local bad=()
local waiting=()
local good=()
while [[ $(date +%s) -lt $deadline ]]; do
local lines
lines=$(get_compose_services)
if [[ -z "$lines" ]]; then
sleep 3
continue
fi
bad=()
waiting=()
good=()
while IFS='|' read -r name state health code; do
[[ -z "$name" ]] && continue
if [[ "$name" == "migrations" ]]; then
if [[ "$state" == "exited" && "$code" == "0" ]]; then
good+=("$name")
elif [[ "$state" == "exited" ]]; then
bad+=("${name} (exit=${code})")
else
waiting+=("${name} (${state})")
fi
continue
fi
if [[ "$state" == "running" ]]; then
if [[ -z "$health" || "$health" == "healthy" ]]; then
good+=("$name")
elif [[ "$health" == "starting" ]]; then
waiting+=("${name} (starting)")
elif [[ "$health" == "unhealthy" ]]; then
bad+=("${name} (unhealthy)")
else
waiting+=("${name} (${health})")
fi
elif [[ "$state" == "restarting" ]]; then
bad+=("${name} (restarting)")
elif [[ "$state" == "exited" ]]; then
bad+=("${name} (exited, code=${code})")
else
waiting+=("${name} (${state})")
fi
done <<< "$lines"
if (( ${#bad[@]} > 0 )); then
STACK_BAD=("${bad[@]}")
STACK_WAITING=("${waiting[@]}")
STACK_GOOD=("${good[@]}")
return 1
fi
if (( ${#waiting[@]} == 0 )); then
STACK_GOOD=("${good[@]}")
return 0
fi
local report="Waiting on: ${waiting[*]}"
if [[ "$report" != "$last_report" ]]; then
info "$report"
last_report="$report"
fi
sleep 5
done
# bad/waiting/good are declared at function scope so referencing them is
# safe even if the polling loop never executed its body.
STACK_BAD=()
[[ ${#bad[@]} -gt 0 ]] && STACK_BAD=("${bad[@]}")
STACK_WAITING=()
[[ ${#waiting[@]} -gt 0 ]] && STACK_WAITING=("${waiting[@]}")
STACK_GOOD=()
[[ ${#good[@]} -gt 0 ]] && STACK_GOOD=("${good[@]}")
STACK_TIMEOUT=true
return 1
}
stack_failure_report() {
echo ""
echo -e "\033[31m[ERROR]\033[0m Stack did not reach a healthy state."
if (( ${#STACK_BAD[@]} > 0 )) && [[ -n "${STACK_BAD[0]}" ]]; then
echo " Failed: ${STACK_BAD[*]}"
fi
if (( ${#STACK_WAITING[@]} > 0 )) && [[ -n "${STACK_WAITING[0]}" ]]; then
echo " Stuck: ${STACK_WAITING[*]}"
fi
echo ""
info "Recent logs from migrations / zero-cache / backend:"
(cd "${INSTALL_DIR}" && ${DC} logs --tail=60 migrations zero-cache backend 2>&1) || true
echo ""
echo "Recovery hints:"
echo " 1. Inspect migrations: cd ${INSTALL_DIR} && ${DC} logs migrations"
echo " 2. Verify publication: cd ${INSTALL_DIR} && ${DC} exec db psql -U surfsense -d surfsense -c 'SELECT pubname FROM pg_publication;'"
echo " 3. Hard reset zero db: cd ${INSTALL_DIR} && ${DC} down && docker volume rm surfsense-zero-cache && ${DC} up -d"
echo ""
exit 1
}
# True if `surfsense-zero-cache` exists but `surfsense-zero-init` does not.
# That signals an install that predates the migrations-service fix; the old
# replica may be half-initialized and would block zero-cache on next start.
test_stale_zero_cache_volume() {
local has_zc has_zi
has_zc=$(docker volume ls --format '{{.Name}}' 2>/dev/null | grep -Fx 'surfsense-zero-cache' || true)
has_zi=$(docker volume ls --format '{{.Name}}' 2>/dev/null | grep -Fx 'surfsense-zero-init' || true)
[[ -n "$has_zc" && -z "$has_zi" ]]
}
invoke_stale_zero_cache_cleanup() {
if ! test_stale_zero_cache_volume; then
return 0
fi
warn "Detected pre-existing 'surfsense-zero-cache' volume from an install that"
warn "predates the migrations-service fix. It may contain a half-initialized"
warn "SQLite replica that would block zero-cache from starting."
warn "The volume will be removed in 5 seconds; press Ctrl+C to cancel."
sleep 5
(cd "${INSTALL_DIR}" && ${DC} down --remove-orphans 2>/dev/null) || true
docker volume rm surfsense-zero-cache 2>/dev/null || true
success "Removed surfsense-zero-cache volume; zero-cache will re-sync on next start."
}
# ── Download files ───────────────────────────────────────────────────────────
step "Downloading SurfSense files"
@ -186,6 +343,8 @@ fi
# ── Start containers ─────────────────────────────────────────────────────────
invoke_stale_zero_cache_cleanup
if $MIGRATION_MODE; then
# Read DB credentials from .env (fall back to defaults from docker-compose.yml)
DB_USER=$(grep '^DB_USER=' "${INSTALL_DIR}/.env" 2>/dev/null | cut -d= -f2 | tr -d '"' | head -1 || true)
@ -243,7 +402,12 @@ if $MIGRATION_MODE; then
step "Starting all SurfSense services"
(cd "${INSTALL_DIR}" && ${DC} up -d) < /dev/null
success "All services started."
success "All containers started; waiting for stack to become healthy..."
if ! wait_stack_healthy 300; then
stack_failure_report
fi
success "All services healthy."
# Key file is no longer needed — SECRET_KEY is now in .env
rm -f "${KEY_FILE}"
@ -251,7 +415,12 @@ if $MIGRATION_MODE; then
else
step "Starting SurfSense"
(cd "${INSTALL_DIR}" && ${DC} up -d) < /dev/null
success "All services started."
success "All containers started; waiting for stack to become healthy..."
if ! wait_stack_healthy 300; then
stack_failure_report
fi
success "All services healthy."
fi
# ── Watchtower (auto-update) ─────────────────────────────────────────────────