From b285293b4e999cf9a40251789503e5c97671f16a Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Wed, 20 May 2026 01:25:07 -0700 Subject: [PATCH] fix: docker one click setup --- .gitignore | 7 +- docker/docker-compose.deps-only.yml | 16 ++ docker/docker-compose.dev.yml | 39 +++- docker/docker-compose.yml | 45 ++++- docker/scripts/install.ps1 | 173 +++++++++++++++++- docker/scripts/install.sh | 173 +++++++++++++++++- surfsense_backend/Dockerfile | 8 +- surfsense_backend/app/app.py | 30 +++ .../scripts/docker/entrypoint.sh | 90 ++++++++- .../docker-installation/docker-compose.mdx | 127 ++++++++++++- 10 files changed, 681 insertions(+), 27 deletions(-) diff --git a/.gitignore b/.gitignore index ac2ff94c9..a99954efe 100644 --- a/.gitignore +++ b/.gitignore @@ -6,16 +6,15 @@ node_modules/ .venv .pnpm-store .DS_Store -deepagents/ debug.log -opencode/ + +references/ +references # Playwright (E2E test artifacts) surfsense_web/playwright/.auth/ surfsense_web/playwright-report/ surfsense_web/test-results/ surfsense_web/blob-report/ -hermes-agent -hermes-agent/ content_research/ diff --git a/docker/docker-compose.deps-only.yml b/docker/docker-compose.deps-only.yml index 31dcd8b26..2be0bfe6e 100644 --- a/docker/docker-compose.deps-only.yml +++ b/docker/docker-compose.deps-only.yml @@ -20,6 +20,18 @@ # - Backend .env: SEARXNG_DEFAULT_HOST=http://localhost:${SEARXNG_PORT:-8888} # - Backend .env: CELERY_BROKER_URL / REDIS_APP_URL → redis://localhost:6379/0 # - Web .env: NEXT_PUBLIC_ZERO_CACHE_URL=http://localhost:${ZERO_CACHE_PORT:-4848} +# +# IMPORTANT — schema migrations: +# This compose file does NOT build the backend image and therefore cannot +# run a `migrations` service. You MUST run alembic on the host before +# bringing zero-cache up, or zero-cache will crash-loop with +# `Unknown or invalid publications. Specified: [zero_publication]`. 
+# +# First-time / after-pull: +# cd surfsense_backend && uv run alembic upgrade head +# +# The other compose files (docker-compose.yml, docker-compose.dev.yml) +# handle this automatically via a dedicated `migrations` service. # ============================================================================= name: surfsense-deps @@ -82,6 +94,10 @@ services: timeout: 5s retries: 5 + # NOTE: zero-cache requires the `zero_publication` Postgres publication to + # exist before it starts. In this deps-only stack there is no backend + # container to run migrations, so you must run `uv run alembic upgrade head` + # from `surfsense_backend/` on the host (with Postgres already up) BEFORE starting zero-cache. zero-cache: image: rocicorp/zero:1.4.0 ports: diff --git a/docker/docker-compose.dev.yml b/docker/docker-compose.dev.yml index 5338a649e..53b8ea1a9 100644 --- a/docker/docker-compose.dev.yml +++ b/docker/docker-compose.dev.yml @@ -34,6 +34,25 @@ services: timeout: 5s retries: 5 + # Short-lived schema runner; see docker/docker-compose.yml `migrations` + # service for the full rationale. Builds from the same backend context as + # the dev backend/celery services. 
+ migrations: + build: *backend-build + env_file: + - ../surfsense_backend/.env + environment: + - DATABASE_URL=${DATABASE_URL:-postgresql+asyncpg://${DB_USER:-postgres}:${DB_PASSWORD:-postgres}@${DB_HOST:-db}:${DB_PORT:-5432}/${DB_NAME:-surfsense}} + - PYTHONPATH=/app + - SERVICE_ROLE=migrate + - MIGRATION_TIMEOUT=${MIGRATION_TIMEOUT:-900} + volumes: + - zero_init:/zero-init + depends_on: + db: + condition: service_healthy + restart: "no" + pgadmin: image: dpage/pgadmin4 ports: @@ -111,8 +130,10 @@ services: condition: service_healthy searxng: condition: service_healthy + migrations: + condition: service_completed_successfully healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:8000/health"] + test: ["CMD", "curl", "-f", "http://localhost:8000/ready"] interval: 15s timeout: 5s retries: 30 @@ -141,6 +162,8 @@ services: condition: service_healthy redis: condition: service_healthy + migrations: + condition: service_completed_successfully backend: condition: service_healthy @@ -160,6 +183,8 @@ services: condition: service_healthy redis: condition: service_healthy + migrations: + condition: service_completed_successfully celery_worker: condition: service_started @@ -185,8 +210,10 @@ services: extra_hosts: - "host.docker.internal:host-gateway" depends_on: - backend: + db: condition: service_healthy + migrations: + condition: service_completed_successfully environment: - ZERO_UPSTREAM_DB=${ZERO_UPSTREAM_DB:-postgresql://${DB_USER:-postgres}:${DB_PASSWORD:-postgres}@${DB_HOST:-db}:${DB_PORT:-5432}/${DB_NAME:-surfsense}?sslmode=${DB_SSLMODE:-disable}} - ZERO_CVR_DB=${ZERO_CVR_DB:-postgresql://${DB_USER:-postgres}:${DB_PASSWORD:-postgres}@${DB_HOST:-db}:${DB_PORT:-5432}/${DB_NAME:-surfsense}?sslmode=${DB_SSLMODE:-disable}} @@ -201,6 +228,12 @@ services: - ZERO_MUTATE_URL=${ZERO_MUTATE_URL:-http://frontend:3000/api/zero/mutate} volumes: - zero_cache_data:/data + - zero_init:/zero-init + # Wrapper: see docker/docker-compose.yml `zero-cache` for rationale. 
+ entrypoint: ["sh", "-c"] + # Pass the script as a single list element so Compose does not tokenize it. + command: + - 'if [ -f /zero-init/needs_reset ]; then echo "[zero-init] publication change detected; wiping replica file(s) under /data" && rm -f /data/zero.db /data/zero.db-shm /data/zero.db-wal && rm -f /zero-init/needs_reset; fi; exec zero-cache' restart: unless-stopped healthcheck: test: ["CMD", "curl", "-f", "http://localhost:4848/keepalive"] @@ -238,3 +271,5 @@ volumes: name: surfsense-dev-shared-temp zero_cache_data: name: surfsense-dev-zero-cache + zero_init: + name: surfsense-dev-zero-init diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 18147a189..82d77f826 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -27,6 +27,28 @@ services: timeout: 5s retries: 5 + # Short-lived schema runner. Executes `alembic upgrade head` and verifies + # that the `zero_publication` Postgres logical-replication publication + # exists, then exits 0. Downstream services (backend, celery_*, zero-cache) + # gate on this with `condition: service_completed_successfully` so a failed + # migration halts the whole stack instead of silently producing a half-built + # system that crash-loops zero-cache on missing publications. 
+ migrations: + image: ghcr.io/modsetter/surfsense-backend:${SURFSENSE_VERSION:-latest} + env_file: + - .env + environment: + DATABASE_URL: ${DATABASE_URL:-postgresql+asyncpg://${DB_USER:-surfsense}:${DB_PASSWORD:-surfsense}@${DB_HOST:-db}:${DB_PORT:-5432}/${DB_NAME:-surfsense}} + PYTHONPATH: /app + SERVICE_ROLE: migrate + MIGRATION_TIMEOUT: ${MIGRATION_TIMEOUT:-900} + volumes: + - zero_init:/zero-init + depends_on: + db: + condition: service_healthy + restart: "no" + redis: image: redis:8-alpine volumes: @@ -88,9 +110,11 @@ services: condition: service_healthy searxng: condition: service_healthy + migrations: + condition: service_completed_successfully restart: unless-stopped healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:8000/health"] + test: ["CMD", "curl", "-f", "http://localhost:8000/ready"] interval: 15s timeout: 5s retries: 30 @@ -118,6 +142,8 @@ services: condition: service_healthy redis: condition: service_healthy + migrations: + condition: service_completed_successfully backend: condition: service_healthy labels: @@ -140,6 +166,8 @@ services: condition: service_healthy redis: condition: service_healthy + migrations: + condition: service_completed_successfully celery_worker: condition: service_started labels: @@ -182,10 +210,21 @@ services: ZERO_MUTATE_URL: ${ZERO_MUTATE_URL:-http://frontend:3000/api/zero/mutate} volumes: - zero_cache_data:/data + - zero_init:/zero-init + # Wrapper: if the migrations service flagged a publication change via + # /zero-init/needs_reset, wipe the SQLite replica before starting so + # zero-cache does a clean initial sync. Recovers from the half-built + # replica state (`_zero.tableMetadata` missing) caused by earlier crashes. + entrypoint: ["sh", "-c"] + # Pass the script as a single list element so Compose does not tokenize it. 
+ command: + - 'if [ -f /zero-init/needs_reset ]; then echo "[zero-init] publication change detected; wiping replica file(s) under /data" && rm -f /data/zero.db /data/zero.db-shm /data/zero.db-wal && rm -f /zero-init/needs_reset; fi; exec zero-cache' restart: unless-stopped depends_on: - backend: + db: condition: service_healthy + migrations: + condition: service_completed_successfully healthcheck: test: ["CMD", "curl", "-f", "http://localhost:4848/keepalive"] interval: 10s @@ -221,3 +260,5 @@ volumes: name: surfsense-shared-temp zero_cache_data: name: surfsense-zero-cache + zero_init: + name: surfsense-zero-init diff --git a/docker/scripts/install.ps1 b/docker/scripts/install.ps1 index 0eb3886a2..60c4fd5df 100644 --- a/docker/scripts/install.ps1 +++ b/docker/scripts/install.ps1 @@ -97,6 +97,161 @@ function Wait-ForPostgres { Write-Ok "PostgreSQL is ready." } +# ── Stack health helpers ──────────────────────────────────────────────────── + +function Get-ComposeServices { + Push-Location $InstallDir + try { + $raw = Invoke-NativeSafe { docker compose ps -a --format json 2>$null } + } finally { + Pop-Location + } + if ([string]::IsNullOrWhiteSpace($raw)) { return @() } + + # Compose v2.21+ emits one JSON object per line (NDJSON); older versions emit a single JSON array. 
+ try { + $parsed = $raw | ConvertFrom-Json + if ($parsed -is [System.Collections.IEnumerable] -and -not ($parsed -is [string])) { + return @($parsed) + } + return @($parsed) + } catch { + $services = @() + foreach ($line in ($raw -split "`r?`n")) { + $line = $line.Trim() + if (-not $line) { continue } + try { $services += ($line | ConvertFrom-Json) } catch { } + } + return $services + } +} + +function Wait-StackHealthy { + param([int]$TimeoutSec = 300) + + $deadline = (Get-Date).AddSeconds($TimeoutSec) + $lastReport = "" + + while ((Get-Date) -lt $deadline) { + $services = Get-ComposeServices + if (-not $services -or $services.Count -eq 0) { + Start-Sleep -Seconds 3 + continue + } + + $bad = @() + $waiting = @() + $good = @() + + foreach ($svc in $services) { + $name = $svc.Service + $state = $svc.State + $health = if ($svc.PSObject.Properties.Name -contains 'Health') { $svc.Health } else { '' } + $exit = if ($svc.PSObject.Properties.Name -contains 'ExitCode') { $svc.ExitCode } else { $null } + + if ($name -eq 'migrations') { + if ($state -eq 'exited' -and $exit -eq 0) { $good += $name } + elseif ($state -eq 'exited') { $bad += "${name} (exit=${exit})" } + else { $waiting += "${name} (${state})" } + continue + } + + if ($state -eq 'running') { + if ([string]::IsNullOrEmpty($health) -or $health -eq 'healthy') { + $good += $name + } elseif ($health -eq 'starting') { + $waiting += "${name} (starting)" + } elseif ($health -eq 'unhealthy') { + $bad += "${name} (unhealthy)" + } else { + $waiting += "${name} (${health})" + } + } elseif ($state -eq 'restarting') { + $bad += "${name} (restarting)" + } elseif ($state -eq 'exited') { + $bad += "${name} (exited, code=${exit})" + } else { + $waiting += "${name} (${state})" + } + } + + if ($bad.Count -gt 0) { + return @{ Ok = $false; Reason = 'failure'; Bad = $bad; Waiting = $waiting; Good = $good } + } + if ($waiting.Count -eq 0) { + return @{ Ok = $true; Reason = 'all_healthy'; Good = $good } + } + + $report = "Waiting on: " 
+ ($waiting -join ', ') + if ($report -ne $lastReport) { + Write-Info $report + $lastReport = $report + } + Start-Sleep -Seconds 5 + } + + return @{ Ok = $false; Reason = 'timeout'; Bad = $bad; Waiting = $waiting; Good = $good } +} + +function Test-StaleZeroCacheVolume { + $raw = Invoke-NativeSafe { docker volume ls --format '{{.Name}}' 2>$null } + if ([string]::IsNullOrWhiteSpace($raw)) { return $false } + $names = $raw -split "`r?`n" | ForEach-Object { $_.Trim() } | Where-Object { $_ } + $hasZeroCache = $names -contains 'surfsense-zero-cache' + $hasZeroInit = $names -contains 'surfsense-zero-init' + # Pre-fix installs created surfsense-zero-cache but never surfsense-zero-init. + # Such a volume may hold a half-initialized SQLite replica from an earlier + # crash-loop. Wiping it forces zero-cache to do a fresh initial sync. + return ($hasZeroCache -and -not $hasZeroInit) +} + +function Invoke-StaleZeroCacheCleanup { + if (-not (Test-StaleZeroCacheVolume)) { return } + + Write-Warn "Detected pre-existing 'surfsense-zero-cache' volume from an install that" + Write-Warn "predates the migrations-service fix. It may contain a half-initialized" + Write-Warn "SQLite replica that would block zero-cache from starting." + Write-Warn "The volume will be removed in 5 seconds; press Ctrl+C to cancel." + Start-Sleep -Seconds 5 + + Push-Location $InstallDir + Invoke-NativeSafe { docker compose down --remove-orphans 2>$null } | Out-Null + Pop-Location + Invoke-NativeSafe { docker volume rm surfsense-zero-cache 2>$null } | Out-Null + Write-Ok "Removed surfsense-zero-cache volume; zero-cache will re-sync on next start." +} + +function Write-Err-NoExit { + param([string]$Message) + Write-Host "[ERROR] $Message" -ForegroundColor Red +} + +function Invoke-StackFailureReport { + param([hashtable]$Result) + + Write-Host "" + Write-Err-NoExit "Stack did not reach a healthy state." 
+ if ($Result.Bad.Count -gt 0) { Write-Host (" Failed: " + ($Result.Bad -join ', ')) } + if ($Result.Waiting.Count -gt 0) { Write-Host (" Stuck: " + ($Result.Waiting -join ', ')) } + + Write-Host "" + Write-Info "Recent logs from migrations / zero-cache / backend:" + Push-Location $InstallDir + try { + Invoke-NativeSafe { docker compose logs --tail=60 migrations zero-cache backend 2>&1 } | Write-Host + } finally { + Pop-Location + } + + Write-Host "" + Write-Host "Recovery hints:" -ForegroundColor Yellow + Write-Host " 1. Inspect migrations: cd $InstallDir; docker compose logs migrations" + Write-Host " 2. Verify publication: cd $InstallDir; docker compose exec db psql -U surfsense -d surfsense -c 'SELECT pubname FROM pg_publication;'" + Write-Host " 3. Hard reset zero db: cd $InstallDir; docker compose down; docker volume rm surfsense-zero-cache; docker compose up -d" + Write-Host "" + exit 1 +} + # ── Download files ────────────────────────────────────────────────────────── Write-Step "Downloading SurfSense files" @@ -191,6 +346,8 @@ if (-not (Test-Path $envPath)) { # ── Start containers ──────────────────────────────────────────────────────── +Invoke-StaleZeroCacheCleanup + if ($MigrationMode) { $envContent = Get-Content $envPath $DbUser = ($envContent | Select-String '^DB_USER=' | ForEach-Object { ($_ -split '=',2)[1].Trim('"') }) | Select-Object -First 1 @@ -251,7 +408,13 @@ if ($MigrationMode) { Push-Location $InstallDir Invoke-NativeSafe { docker compose up -d } Pop-Location - Write-Ok "All services started." + Write-Ok "All containers started; waiting for stack to become healthy..." + + $waitResult = Wait-StackHealthy -TimeoutSec 300 + if (-not $waitResult.Ok) { + Invoke-StackFailureReport -Result $waitResult + } + Write-Ok "All services healthy." Remove-Item $KeyFile -ErrorAction SilentlyContinue @@ -260,7 +423,13 @@ if ($MigrationMode) { Push-Location $InstallDir Invoke-NativeSafe { docker compose up -d } Pop-Location - Write-Ok "All services started." 
+ Write-Ok "All containers started; waiting for stack to become healthy..." + + $waitResult = Wait-StackHealthy -TimeoutSec 300 + if (-not $waitResult.Ok) { + Invoke-StackFailureReport -Result $waitResult + } + Write-Ok "All services healthy." } # ── Watchtower (auto-update) ──────────────────────────────────────────────── diff --git a/docker/scripts/install.sh b/docker/scripts/install.sh index fcab4d55a..db81f95eb 100644 --- a/docker/scripts/install.sh +++ b/docker/scripts/install.sh @@ -97,6 +97,163 @@ wait_for_pg() { success "PostgreSQL is ready." } +# ── Stack health helpers ───────────────────────────────────────────────────── + +# Enumerate compose services for project `surfsense` as `service|state|health|exitcode` +# lines. Uses `docker inspect` so we don't depend on `jq`, `python3`, or the +# exact ordering of fields in `docker compose ps --format json` output. +get_compose_services() { + local containers + containers=$(docker ps -a --filter "label=com.docker.compose.project=surfsense" --format '{{.Names}}' 2>/dev/null) || true + [[ -z "$containers" ]] && return 0 + + while IFS= read -r container; do + [[ -z "$container" ]] && continue + local svc state health code + svc=$(docker inspect -f '{{index .Config.Labels "com.docker.compose.service"}}' "$container" 2>/dev/null || echo "") + state=$(docker inspect -f '{{.State.Status}}' "$container" 2>/dev/null || echo "unknown") + health=$(docker inspect -f '{{if .State.Health}}{{.State.Health.Status}}{{end}}' "$container" 2>/dev/null || echo "") + code=$(docker inspect -f '{{.State.ExitCode}}' "$container" 2>/dev/null || echo "") + [[ -z "$svc" ]] && continue + printf '%s|%s|%s|%s\n' "$svc" "$state" "$health" "$code" + done <<< "$containers" +} + +# Globals populated by wait_stack_healthy / consumed by stack_failure_report. 
+STACK_BAD=() +STACK_WAITING=() +STACK_GOOD=() +STACK_TIMEOUT=false + +wait_stack_healthy() { + local timeout_sec=${1:-300} + local deadline=$(($(date +%s) + timeout_sec)) + local last_report="" + local bad=() + local waiting=() + local good=() + + while [[ $(date +%s) -lt $deadline ]]; do + local lines + lines=$(get_compose_services) + if [[ -z "$lines" ]]; then + sleep 3 + continue + fi + + bad=() + waiting=() + good=() + + while IFS='|' read -r name state health code; do + [[ -z "$name" ]] && continue + if [[ "$name" == "migrations" ]]; then + if [[ "$state" == "exited" && "$code" == "0" ]]; then + good+=("$name") + elif [[ "$state" == "exited" ]]; then + bad+=("${name} (exit=${code})") + else + waiting+=("${name} (${state})") + fi + continue + fi + + if [[ "$state" == "running" ]]; then + if [[ -z "$health" || "$health" == "healthy" ]]; then + good+=("$name") + elif [[ "$health" == "starting" ]]; then + waiting+=("${name} (starting)") + elif [[ "$health" == "unhealthy" ]]; then + bad+=("${name} (unhealthy)") + else + waiting+=("${name} (${health})") + fi + elif [[ "$state" == "restarting" ]]; then + bad+=("${name} (restarting)") + elif [[ "$state" == "exited" ]]; then + bad+=("${name} (exited, code=${code})") + else + waiting+=("${name} (${state})") + fi + done <<< "$lines" + + if (( ${#bad[@]} > 0 )); then + STACK_BAD=("${bad[@]}") + STACK_WAITING=("${waiting[@]}") + STACK_GOOD=("${good[@]}") + return 1 + fi + if (( ${#waiting[@]} == 0 )); then + STACK_GOOD=("${good[@]}") + return 0 + fi + + local report="Waiting on: ${waiting[*]}" + if [[ "$report" != "$last_report" ]]; then + info "$report" + last_report="$report" + fi + sleep 5 + done + + # bad/waiting/good are declared at function scope so referencing them is + # safe even if the polling loop never executed its body. 
+ STACK_BAD=() + [[ ${#bad[@]} -gt 0 ]] && STACK_BAD=("${bad[@]}") + STACK_WAITING=() + [[ ${#waiting[@]} -gt 0 ]] && STACK_WAITING=("${waiting[@]}") + STACK_GOOD=() + [[ ${#good[@]} -gt 0 ]] && STACK_GOOD=("${good[@]}") + STACK_TIMEOUT=true + return 1 +} + +stack_failure_report() { + echo "" + echo -e "\033[31m[ERROR]\033[0m Stack did not reach a healthy state." + if (( ${#STACK_BAD[@]} > 0 )) && [[ -n "${STACK_BAD[0]}" ]]; then + echo " Failed: ${STACK_BAD[*]}" + fi + if (( ${#STACK_WAITING[@]} > 0 )) && [[ -n "${STACK_WAITING[0]}" ]]; then + echo " Stuck: ${STACK_WAITING[*]}" + fi + echo "" + info "Recent logs from migrations / zero-cache / backend:" + (cd "${INSTALL_DIR}" && ${DC} logs --tail=60 migrations zero-cache backend 2>&1) || true + echo "" + echo "Recovery hints:" + echo " 1. Inspect migrations: cd ${INSTALL_DIR} && ${DC} logs migrations" + echo " 2. Verify publication: cd ${INSTALL_DIR} && ${DC} exec db psql -U surfsense -d surfsense -c 'SELECT pubname FROM pg_publication;'" + echo " 3. Hard reset zero db: cd ${INSTALL_DIR} && ${DC} down && docker volume rm surfsense-zero-cache && ${DC} up -d" + echo "" + exit 1 +} + +# True if `surfsense-zero-cache` exists but `surfsense-zero-init` does not. +# That signals an install that predates the migrations-service fix; the old +# replica may be half-initialized and would block zero-cache on next start. +test_stale_zero_cache_volume() { + local has_zc has_zi + has_zc=$(docker volume ls --format '{{.Name}}' 2>/dev/null | grep -Fx 'surfsense-zero-cache' || true) + has_zi=$(docker volume ls --format '{{.Name}}' 2>/dev/null | grep -Fx 'surfsense-zero-init' || true) + [[ -n "$has_zc" && -z "$has_zi" ]] +} + +invoke_stale_zero_cache_cleanup() { + if ! test_stale_zero_cache_volume; then + return 0 + fi + warn "Detected pre-existing 'surfsense-zero-cache' volume from an install that" + warn "predates the migrations-service fix. 
It may contain a half-initialized" + warn "SQLite replica that would block zero-cache from starting." + warn "The volume will be removed in 5 seconds; press Ctrl+C to cancel." + sleep 5 + + (cd "${INSTALL_DIR}" && ${DC} down --remove-orphans 2>/dev/null) || true + docker volume rm surfsense-zero-cache 2>/dev/null || true + success "Removed surfsense-zero-cache volume; zero-cache will re-sync on next start." +} + # ── Download files ─────────────────────────────────────────────────────────── step "Downloading SurfSense files" @@ -186,6 +343,8 @@ fi # ── Start containers ───────────────────────────────────────────────────────── +invoke_stale_zero_cache_cleanup + if $MIGRATION_MODE; then # Read DB credentials from .env (fall back to defaults from docker-compose.yml) DB_USER=$(grep '^DB_USER=' "${INSTALL_DIR}/.env" 2>/dev/null | cut -d= -f2 | tr -d '"' | head -1 || true) @@ -243,7 +402,12 @@ if $MIGRATION_MODE; then step "Starting all SurfSense services" (cd "${INSTALL_DIR}" && ${DC} up -d) < /dev/null - success "All services started." + success "All containers started; waiting for stack to become healthy..." + + if ! wait_stack_healthy 300; then + stack_failure_report + fi + success "All services healthy." # Key file is no longer needed — SECRET_KEY is now in .env rm -f "${KEY_FILE}" @@ -251,7 +415,12 @@ if $MIGRATION_MODE; then else step "Starting SurfSense" (cd "${INSTALL_DIR}" && ${DC} up -d) < /dev/null - success "All services started." + success "All containers started; waiting for stack to become healthy..." + + if ! wait_stack_healthy 300; then + stack_failure_report + fi + success "All services healthy." 
fi # ── Watchtower (auto-update) ───────────────────────────────────────────────── diff --git a/surfsense_backend/Dockerfile b/surfsense_backend/Dockerfile index 6e1b2481e..0c783f403 100644 --- a/surfsense_backend/Dockerfile +++ b/surfsense_backend/Dockerfile @@ -167,10 +167,14 @@ COPY scripts/docker/entrypoint.sh /app/scripts/docker/entrypoint.sh RUN dos2unix /app/scripts/docker/entrypoint.sh && chmod +x /app/scripts/docker/entrypoint.sh # SERVICE_ROLE controls which process this container runs: -# api – FastAPI backend only (runs migrations on startup) +# migrate – Run alembic upgrade head, verify zero_publication exists, exit 0. +# Used by the dedicated `migrations` service in docker-compose.yml +# so downstream services gate on `service_completed_successfully`. +# api – FastAPI backend only (does NOT run migrations) # worker – Celery worker only # beat – Celery beat scheduler only -# all – All three (legacy / dev default) +# all – migrations + api + worker + beat (legacy / dev default; +# fails fast on migration error) ENV SERVICE_ROLE=all # Celery worker tuning (only used when SERVICE_ROLE=worker or all) diff --git a/surfsense_backend/app/app.py b/surfsense_backend/app/app.py index 5057e7d00..fc6242643 100644 --- a/surfsense_backend/app/app.py +++ b/surfsense_backend/app/app.py @@ -945,6 +945,36 @@ async def health_check(): return {"status": "ok"} +@app.get("/ready", tags=["health"]) +@limiter.exempt +async def readiness_check(): + """Readiness probe. + + Verifies that the schema state required by downstream services is + present. Specifically checks that the ``zero_publication`` Postgres + logical-replication publication exists; without it zero-cache crash-loops + on `Unknown or invalid publications`. + + Returns 200 when ready, 503 otherwise. Used by the docker-compose + backend healthcheck and by ``install.ps1`` / ``install.sh`` post-up + verification. 
+ """ + from sqlalchemy import text + + from app.db import async_session_maker + + async with async_session_maker() as session: + result = await session.execute( + text("SELECT 1 FROM pg_publication WHERE pubname = 'zero_publication'") + ) + if result.first() is None: + raise HTTPException( + status_code=503, + detail="zero_publication missing; run alembic upgrade head", + ) + return {"status": "ready"} + + @app.get("/verify-token") async def authenticated_route( user: User = Depends(current_active_user), diff --git a/surfsense_backend/scripts/docker/entrypoint.sh b/surfsense_backend/scripts/docker/entrypoint.sh index 7bfcfce86..81db1ae84 100644 --- a/surfsense_backend/scripts/docker/entrypoint.sh +++ b/surfsense_backend/scripts/docker/entrypoint.sh @@ -4,10 +4,15 @@ set -e # ───────────────────────────────────────────────────────────── # SERVICE_ROLE controls which process(es) this container runs. # -# api – FastAPI backend only (runs migrations on startup) +# migrate – Run `alembic upgrade head`, verify zero_publication, +# then exit 0. Used by the dedicated `migrations` service +# in docker-compose.yml so downstream services can gate +# on `condition: service_completed_successfully`. +# api – FastAPI backend only (does NOT run migrations) # worker – Celery worker only # beat – Celery beat scheduler only -# all – All three in one container (legacy / dev default) +# all – migrations + api + worker + beat in one container +# (legacy / dev default; fails fast on migration error) # # Set SERVICE_ROLE as an environment variable in Coolify for # each service deployment. 
@@ -41,7 +46,13 @@ cleanup() { trap cleanup SIGTERM SIGINT -# ── Database migrations (only for api / all) ───────────────── +# ── Database migrations (only for migrate / all) ───────────── +# Fail-fast contract: +# - alembic upgrade head must succeed within ${MIGRATION_TIMEOUT:-900}s +# - zero_publication must exist in pg_publication afterwards +# Either failure exits non-zero so the dedicated `migrations` compose +# service exits non-zero, halting the rest of the stack instead of +# silently producing a half-built system that crash-loops zero-cache. run_migrations() { echo "Running database migrations..." for i in {1..30}; do @@ -53,11 +64,66 @@ run_migrations() { sleep 1 done - if timeout 300 alembic upgrade head 2>&1; then - echo "Migrations completed successfully." - else - echo "WARNING: Migration failed or timed out. Continuing anyway..." - echo "You may need to run migrations manually: alembic upgrade head" + local timeout_secs="${MIGRATION_TIMEOUT:-900}" + echo "Running alembic upgrade head (timeout=${timeout_secs}s)..." + if ! timeout "${timeout_secs}" alembic upgrade head; then + echo "ERROR: alembic upgrade head failed (or exceeded ${timeout_secs}s timeout)." >&2 + echo "Refusing to start. Inspect the error above and re-run." >&2 + exit 1 + fi + echo "Migrations completed successfully." + + echo "Verifying zero_publication exists in Postgres..." + local pub_oid + pub_oid=$(python <<'PY' 2>/dev/null || true +import asyncio +import sys +from sqlalchemy import text +from app.db import engine + + +async def get_oid(): + async with engine.connect() as conn: + result = await conn.execute( + text("SELECT oid FROM pg_publication WHERE pubname = 'zero_publication'") + ) + row = result.first() + if row is None: + sys.exit(1) + print(int(row[0])) + + +asyncio.run(get_oid()) +PY +) + if [ -z "${pub_oid}" ]; then + echo "ERROR: zero_publication is missing from Postgres after running alembic." 
>&2 + echo "This usually means migration 116 (or a later publication migration) did not run." >&2 + echo "Inspect alembic state with:" >&2 + echo " docker compose exec db psql -U \"\$DB_USER\" -d \"\$DB_NAME\" -c 'SELECT * FROM alembic_version;'" >&2 + exit 1 + fi + echo "zero_publication verified (oid=${pub_oid})." + + # Stale-replica safety net: if /zero-init is mounted (i.e. we are the + # dedicated `migrations` compose service), drop a marker file when the + # publication oid changed (or on first run) so the wrapped zero-cache + # entrypoint can wipe /data/zero.db before starting. This recovers from + # the case where a previous zero-cache crashed mid-init and left a + # half-built SQLite replica without a `_zero.tableMetadata` table. + if [ -d /zero-init ]; then + local stored_oid="" + [ -f /zero-init/last_pub_oid ] && stored_oid=$(cat /zero-init/last_pub_oid 2>/dev/null || true) + if [ -z "${stored_oid}" ] || [ "${stored_oid}" != "${pub_oid}" ]; then + echo "Publication oid changed (stored=${stored_oid:-}, current=${pub_oid}); writing /zero-init/needs_reset." + : > /zero-init/needs_reset + chmod 666 /zero-init/needs_reset 2>/dev/null || true + fi + echo "${pub_oid}" > /zero-init/last_pub_oid + chmod 666 /zero-init/last_pub_oid 2>/dev/null || true + # World-writable dir so the (possibly non-root) zero-cache container + # can `rm -f /zero-init/needs_reset` after acting on the marker. + chmod 777 /zero-init 2>/dev/null || true fi } @@ -102,8 +168,12 @@ start_beat() { # ── Main: run based on role ────────────────────────────────── case "${SERVICE_ROLE}" in - api) + migrate) run_migrations + echo "Migrations complete; exiting cleanly." + exit 0 + ;; + api) start_api ;; worker) @@ -121,7 +191,7 @@ case "${SERVICE_ROLE}" in start_beat ;; *) - echo "ERROR: Unknown SERVICE_ROLE '${SERVICE_ROLE}'. Use: api, worker, beat, or all" + echo "ERROR: Unknown SERVICE_ROLE '${SERVICE_ROLE}'. 
Use: migrate, api, worker, beat, or all" exit 1 ;; esac diff --git a/surfsense_web/content/docs/docker-installation/docker-compose.mdx b/surfsense_web/content/docs/docker-installation/docker-compose.mdx index bd7f579d0..0155969cd 100644 --- a/surfsense_web/content/docs/docker-installation/docker-compose.mdx +++ b/surfsense_web/content/docs/docker-installation/docker-compose.mdx @@ -71,7 +71,7 @@ Defaults work out of the box. Change `ZERO_ADMIN_PASSWORD` for security in produ | `ZERO_UPSTREAM_DB` | PostgreSQL connection URL for replication (must be a direct connection, not via pgbouncer) | *(built from DB_* vars)* | | `ZERO_CVR_DB` | PostgreSQL connection URL for client view records | *(built from DB_* vars)* | | `ZERO_CHANGE_DB` | PostgreSQL connection URL for replication log entries | *(built from DB_* vars)* | -| `ZERO_APP_PUBLICATIONS` | PostgreSQL publication restricting which tables are replicated (created by migration 116) | `zero_publication` | +| `ZERO_APP_PUBLICATIONS` | PostgreSQL publication restricting which tables are replicated (created by migration 116, verified by the `migrations` service before `zero-cache` starts) | `zero_publication` | | `ZERO_NUM_SYNC_WORKERS` | Number of view-sync worker processes. Must be ≤ connection pool sizes | `4` | | `ZERO_UPSTREAM_MAX_CONNS` | Max connections to upstream PostgreSQL for mutations | `20` | | `ZERO_CVR_MAX_CONNS` | Max connections to the CVR database | `30` | @@ -150,7 +150,9 @@ Uncomment the connectors you want to use. Redirect URIs follow the pattern `http | Service | Description | |---------|-------------| | `db` | PostgreSQL with pgvector extension | +| `migrations` | Short-lived: runs `alembic upgrade head` and verifies `zero_publication`, then exits | | `redis` | Message broker for Celery | +| `searxng` | Local privacy-respecting search backend | | `backend` | FastAPI application server | | `celery_worker` | Background task processing (document indexing, etc.) 
| | `celery_beat` | Periodic task scheduler (connector sync) | @@ -159,7 +161,42 @@ Uncomment the connectors you want to use. Redirect URIs follow the pattern `http All services start automatically with `docker compose up -d`. -The backend includes a health check. Dependent services (workers, frontend) wait until the API is fully ready before starting. You can monitor startup progress with `docker compose ps` (look for `(health: starting)` → `(healthy)`). +### How startup ordering works + +Schema migrations run as a dedicated `migrations` service that exits 0 on +success and non-zero on failure. Every other backend-image service gates on +it via `condition: service_completed_successfully`: + +```text +db (healthy) ──▶ migrations (alembic upgrade head + verify zero_publication) + │ + ├── exit 0 ─▶ backend ──▶ frontend + │ celery_worker + │ celery_beat + │ zero-cache ──▶ frontend + │ + └── exit ≠ 0 ─▶ compose halts the rest of the stack +``` + +This guarantees `zero-cache` only starts after `zero_publication` exists in +Postgres. Before this design, a silent migration failure would leave +`zero-cache` crash-looping with `Unknown or invalid publications. Specified: +[zero_publication]. Found: []`. + +### Readiness vs liveness + +The backend exposes two endpoints: + +- `GET /health` — lightweight liveness probe (always returns 200 if the + process is up). +- `GET /ready` — readiness probe that confirms `zero_publication` exists. + Returns 503 if not. The compose `backend.healthcheck` uses `/ready` so the + container only reports `healthy` once the schema is actually usable by + zero-cache. + +You can also monitor startup progress with `docker compose ps` (look for +`(health: starting)` → `(healthy)`). The install script polls these states +automatically and times out after 5 minutes if the stack does not converge. --- @@ -188,6 +225,90 @@ docker compose down -v - **Ports already in use**: Change the relevant `*_PORT` variable in `.env` and restart. 
- **Permission errors on Linux**: You may need to prefix `docker` commands with `sudo`. -- **Zero-cache not starting**: Check `docker compose logs zero-cache`. Ensure PostgreSQL has `wal_level=logical` (configured automatically by the bundled `postgresql.conf`). - **Real-time updates not working**: Open DevTools → Console and check for WebSocket errors. Verify `NEXT_PUBLIC_ZERO_CACHE_URL` matches the running zero-cache address. - **Line ending issues on Windows**: Run `git config --global core.autocrlf true` before cloning. + +### Migration service exited non-zero + +The `migrations` service exits non-zero in two cases: + +1. `alembic upgrade head` failed (timeout or SQL error). +2. `alembic` succeeded but `zero_publication` is still missing from + `pg_publication`. + +Inspect the logs and the alembic state: + +```bash +docker compose logs migrations +docker compose exec db psql -U surfsense -d surfsense \ + -c 'SELECT * FROM alembic_version;' +docker compose exec db psql -U surfsense -d surfsense \ + -c 'SELECT pubname FROM pg_publication;' +``` + +The default migration timeout is 900 seconds. Slow disks (Windows / WSL2) +may need more — set `MIGRATION_TIMEOUT` in `.env` to increase it. + +### Zero-cache stuck on `Unknown or invalid publications` + +Symptom (in `docker compose logs zero-cache`): + +```text +Error: Unknown or invalid publications. Specified: [zero_publication]. Found: [] +``` + +This means `zero-cache` started before `zero_publication` was created. With +the current compose files this should be impossible — the `migrations` +service blocks `zero-cache` from starting. If you see it, your stack +predates the fix or you brought up `zero-cache` manually with `docker +compose up zero-cache` before the migrations service ran. 
+ +Recovery: + +```bash +docker compose down +docker volume rm surfsense-zero-cache # wipe half-built SQLite replica +docker compose up -d # migrations runs first, then zero-cache +``` + +The install script (`install.ps1` / `install.sh`) detects this case +automatically: if it finds a `surfsense-zero-cache` volume from a previous +install with no matching `surfsense-zero-init` volume, it removes the stale +volume before bringing the stack up. + +### Zero-cache crashes with `_zero.tableMetadata` errors + +This indicates a half-initialized SQLite replica left behind by a previous +crash. The `migrations` service writes a marker file on a shared volume +(`surfsense-zero-init`) when the publication OID changes; zero-cache wipes +its replica and re-syncs on next start. If the marker mechanism somehow did +not trigger, run the recovery commands shown above. + +### Ensuring `wal_level = logical` + +Logical replication is required by zero-cache. The bundled +`docker/postgresql.conf` sets `wal_level = logical` automatically. If you +swap in your own config or use a managed Postgres, confirm with: + +```bash +docker compose exec db psql -U surfsense -d surfsense \ + -c "SHOW wal_level;" +``` + +### Using `docker-compose.deps-only.yml` + +`docker-compose.deps-only.yml` runs only the dependencies (Postgres, Redis, +SearXNG, zero-cache) on Docker while the backend and frontend run on the +host. Because there is no backend container in this stack, there is no +`migrations` service either, and you must run alembic on the host **before** +bringing the stack up: + +```bash +cd surfsense_backend +uv run alembic upgrade head +cd ../docker +docker compose -f docker-compose.deps-only.yml up -d +``` + +If you skip the alembic step, `zero-cache` will crash-loop with `Unknown or +invalid publications. Specified: [zero_publication]`.