fix: docker one click setup

2026-05-21 18:55:16 +02:00 · 2026-05-20 01:25:07 -07:00 · 2026-05-20 01:25:07 -07:00 · b285293b4e
commit b285293b4e
parent 8174949b38
10 changed files with 681 additions and 27 deletions
--- a/.gitignore
+++ b/.gitignore
@ -6,16 +6,15 @@ node_modules/
 .venv
 .pnpm-store
 .DS_Store
-deepagents/
 debug.log
-opencode/
+
+references/
+references

 # Playwright (E2E test artifacts)
 surfsense_web/playwright/.auth/
 surfsense_web/playwright-report/
 surfsense_web/test-results/
 surfsense_web/blob-report/
-hermes-agent
-hermes-agent/

 content_research/
--- a/docker/docker-compose.deps-only.yml
+++ b/docker/docker-compose.deps-only.yml
@ -20,6 +20,18 @@
 #   - Backend .env: SEARXNG_DEFAULT_HOST=http://localhost:${SEARXNG_PORT:-8888}
 #   - Backend .env: CELERY_BROKER_URL / REDIS_APP_URL → redis://localhost:6379/0
 #   - Web .env: NEXT_PUBLIC_ZERO_CACHE_URL=http://localhost:${ZERO_CACHE_PORT:-4848}
+#
+# IMPORTANT — schema migrations:
+#   This compose file does NOT build the backend image and therefore cannot
+#   run a `migrations` service. You MUST run alembic on the host before
+#   bringing zero-cache up, or zero-cache will crash-loop with
+#   `Unknown or invalid publications. Specified: [zero_publication]`.
+#
+#   First-time / after-pull:
+#     cd surfsense_backend && uv run alembic upgrade head
+#
+#   The other compose files (docker-compose.yml, docker-compose.dev.yml)
+#   handle this automatically via a dedicated `migrations` service.
 # =============================================================================

 name: surfsense-deps
@ -82,6 +94,10 @@ services:
      timeout: 5s
      retries: 5

+  # NOTE: zero-cache requires the `zero_publication` Postgres publication to
+  # exist before it starts. In this deps-only stack there is no backend
+  # container to run migrations, so you must run `uv run alembic upgrade head`
+  # from `surfsense_backend/` on the host BEFORE zero-cache is started.
+  # alembic needs a reachable Postgres: if the database comes from this stack,
+  # start it first, run alembic, then `docker compose up -d` the rest.
  zero-cache:
    image: rocicorp/zero:1.4.0
    ports:
--- a/docker/docker-compose.dev.yml
+++ b/docker/docker-compose.dev.yml
@ -34,6 +34,25 @@ services:
      timeout: 5s
      retries: 5

+  # Short-lived schema runner; see docker/docker-compose.yml `migrations`
+  # service for the full rationale. Builds from the same backend context as
+  # the dev backend/celery services.
+  migrations:
+    build: *backend-build
+    env_file:
+      - ../surfsense_backend/.env
+    environment:
+      - DATABASE_URL=${DATABASE_URL:-postgresql+asyncpg://${DB_USER:-postgres}:${DB_PASSWORD:-postgres}@${DB_HOST:-db}:${DB_PORT:-5432}/${DB_NAME:-surfsense}}
+      - PYTHONPATH=/app
+      # Selects the entrypoint's migrate-only role: run migrations, then exit 0.
+      - SERVICE_ROLE=migrate
+      # Seconds the entrypoint allows `alembic upgrade head` to run.
+      - MIGRATION_TIMEOUT=${MIGRATION_TIMEOUT:-900}
+    volumes:
+      # Shared with zero-cache: the entrypoint drops /zero-init/needs_reset here
+      # and the zero-cache wrapper command consumes it.
+      - zero_init:/zero-init
+    depends_on:
+      db:
+        condition: service_healthy
+    # One-shot job: dependents gate on `service_completed_successfully`.
+    restart: "no"
+
  pgadmin:
    image: dpage/pgadmin4
    ports:
@ -111,8 +130,10 @@ services:
        condition: service_healthy
      searxng:
        condition: service_healthy
+      migrations:
+        condition: service_completed_successfully
    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      test: ["CMD", "curl", "-f", "http://localhost:8000/ready"]
      interval: 15s
      timeout: 5s
      retries: 30
@ -141,6 +162,8 @@ services:
        condition: service_healthy
      redis:
        condition: service_healthy
+      migrations:
+        condition: service_completed_successfully
      backend:
        condition: service_healthy

@ -160,6 +183,8 @@ services:
        condition: service_healthy
      redis:
        condition: service_healthy
+      migrations:
+        condition: service_completed_successfully
      celery_worker:
        condition: service_started

@ -185,8 +210,10 @@ services:
    extra_hosts:
      - "host.docker.internal:host-gateway"
    depends_on:
-      backend:
+      db:
        condition: service_healthy
+      migrations:
+        condition: service_completed_successfully
    environment:
      - ZERO_UPSTREAM_DB=${ZERO_UPSTREAM_DB:-postgresql://${DB_USER:-postgres}:${DB_PASSWORD:-postgres}@${DB_HOST:-db}:${DB_PORT:-5432}/${DB_NAME:-surfsense}?sslmode=${DB_SSLMODE:-disable}}
      - ZERO_CVR_DB=${ZERO_CVR_DB:-postgresql://${DB_USER:-postgres}:${DB_PASSWORD:-postgres}@${DB_HOST:-db}:${DB_PORT:-5432}/${DB_NAME:-surfsense}?sslmode=${DB_SSLMODE:-disable}}
@ -201,6 +228,12 @@ services:
      - ZERO_MUTATE_URL=${ZERO_MUTATE_URL:-http://frontend:3000/api/zero/mutate}
    volumes:
      - zero_cache_data:/data
+      - zero_init:/zero-init
+    # Wrapper: see docker/docker-compose.yml `zero-cache` for rationale.
+    entrypoint: ["sh", "-c"]
+    # Pass the script as a single list element so Compose does not tokenize it.
+    command:
+      - 'if [ -f /zero-init/needs_reset ]; then echo "[zero-init] publication change detected; wiping replica file(s) under /data" && rm -f /data/zero.db /data/zero.db-shm /data/zero.db-wal && rm -f /zero-init/needs_reset; fi; exec zero-cache'
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:4848/keepalive"]
@ -238,3 +271,5 @@ volumes:
    name: surfsense-dev-shared-temp
  zero_cache_data:
    name: surfsense-dev-zero-cache
+  zero_init:
+    name: surfsense-dev-zero-init
--- a/docker/docker-compose.yml
+++ b/docker/docker-compose.yml
@ -27,6 +27,28 @@ services:
      timeout: 5s
      retries: 5

+  # Short-lived schema runner. Executes `alembic upgrade head` and verifies
+  # that the `zero_publication` Postgres logical-replication publication
+  # exists, then exits 0. Downstream services (backend, celery_*, zero-cache)
+  # gate on this with `condition: service_completed_successfully` so a failed
+  # migration halts the whole stack instead of silently producing a half-built
+  # system that crash-loops zero-cache on missing publications.
+  migrations:
+    image: ghcr.io/modsetter/surfsense-backend:${SURFSENSE_VERSION:-latest}
+    env_file:
+      - .env
+    environment:
+      DATABASE_URL: ${DATABASE_URL:-postgresql+asyncpg://${DB_USER:-surfsense}:${DB_PASSWORD:-surfsense}@${DB_HOST:-db}:${DB_PORT:-5432}/${DB_NAME:-surfsense}}
+      PYTHONPATH: /app
+      # Selects the entrypoint's migrate-only role: run migrations, then exit 0.
+      SERVICE_ROLE: migrate
+      # Seconds the entrypoint allows `alembic upgrade head` to run.
+      MIGRATION_TIMEOUT: ${MIGRATION_TIMEOUT:-900}
+    volumes:
+      # Shared with zero-cache: the entrypoint drops /zero-init/needs_reset here
+      # and the zero-cache wrapper command consumes it.
+      - zero_init:/zero-init
+    depends_on:
+      db:
+        condition: service_healthy
+    # One-shot job: it must not be restarted after it exits.
+    restart: "no"
+
  redis:
    image: redis:8-alpine
    volumes:
@ -88,9 +110,11 @@ services:
        condition: service_healthy
      searxng:
        condition: service_healthy
+      migrations:
+        condition: service_completed_successfully
    restart: unless-stopped
    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      test: ["CMD", "curl", "-f", "http://localhost:8000/ready"]
      interval: 15s
      timeout: 5s
      retries: 30
@ -118,6 +142,8 @@ services:
        condition: service_healthy
      redis:
        condition: service_healthy
+      migrations:
+        condition: service_completed_successfully
      backend:
        condition: service_healthy
    labels:
@ -140,6 +166,8 @@ services:
        condition: service_healthy
      redis:
        condition: service_healthy
+      migrations:
+        condition: service_completed_successfully
      celery_worker:
        condition: service_started
    labels:
@ -182,10 +210,21 @@ services:
      ZERO_MUTATE_URL: ${ZERO_MUTATE_URL:-http://frontend:3000/api/zero/mutate}
    volumes:
      - zero_cache_data:/data
+      - zero_init:/zero-init
+    # Wrapper: if the migrations service flagged a publication change via
+    # /zero-init/needs_reset, wipe the SQLite replica before starting so
+    # zero-cache does a clean initial sync. Recovers from the half-built
+    # replica state (`_zero.tableMetadata` missing) caused by earlier crashes.
+    entrypoint: ["sh", "-c"]
+    # Pass the script as a single list element so Compose does not tokenize it.
+    command:
+      - 'if [ -f /zero-init/needs_reset ]; then echo "[zero-init] publication change detected; wiping replica file(s) under /data" && rm -f /data/zero.db /data/zero.db-shm /data/zero.db-wal && rm -f /zero-init/needs_reset; fi; exec zero-cache'
    restart: unless-stopped
    depends_on:
-      backend:
+      db:
        condition: service_healthy
+      migrations:
+        condition: service_completed_successfully
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:4848/keepalive"]
      interval: 10s
@ -221,3 +260,5 @@ volumes:
    name: surfsense-shared-temp
  zero_cache_data:
    name: surfsense-zero-cache
+  zero_init:
+    name: surfsense-zero-init
--- a/docker/scripts/install.ps1
+++ b/docker/scripts/install.ps1
@ -97,6 +97,161 @@ function Wait-ForPostgres {
    Write-Ok "PostgreSQL is ready."
 }

+# ── Stack health helpers ────────────────────────────────────────────────────
+
+function Get-ComposeServices {
+    # Lists every container of the compose project in $InstallDir (running or
+    # not) as objects parsed from `docker compose ps -a --format json`.
+    # Yields nothing when docker produced no output.
+    Push-Location $InstallDir
+    try {
+        $psOutput = Invoke-NativeSafe { docker compose ps -a --format json 2>$null }
+    } finally {
+        Pop-Location
+    }
+    if ([string]::IsNullOrWhiteSpace($psOutput)) { return @() }
+
+    # Depending on the Compose release the output is either one JSON document
+    # (an array) or one JSON object per line. Try the single-document form
+    # first and fall back to line-by-line parsing if that throws.
+    try {
+        return @($psOutput | ConvertFrom-Json)
+    } catch {
+        $collected = @()
+        foreach ($candidate in ($psOutput -split "`r?`n")) {
+            $trimmed = $candidate.Trim()
+            if ($trimmed) {
+                # Lines that are not valid JSON are skipped on purpose.
+                try { $collected += ($trimmed | ConvertFrom-Json) } catch { }
+            }
+        }
+        return $collected
+    }
+}
+
+function Wait-StackHealthy {
+    # Polls Get-ComposeServices until every container has settled, one has
+    # failed, or the timeout elapses.
+    #   -TimeoutSec  Maximum number of seconds to keep polling (default 300).
+    # Returns a hashtable with:
+    #   Ok      $true only when nothing is failed or still waiting
+    #   Reason  'all_healthy' | 'failure' | 'timeout'
+    #   Bad / Waiting / Good  service descriptions from the last poll
+    param([int]$TimeoutSec = 300)
+
+    $deadline = (Get-Date).AddSeconds($TimeoutSec)
+    $lastReport = ""
+
+    # Initialised up front so the timeout result always carries arrays, even
+    # when no poll ever returned a container.
+    $bad = @()
+    $waiting = @()
+    $good = @()
+
+    while ((Get-Date) -lt $deadline) {
+        $services = Get-ComposeServices
+        if (-not $services -or $services.Count -eq 0) {
+            # Containers may not have been created yet; retry shortly.
+            Start-Sleep -Seconds 3
+            continue
+        }
+
+        $bad = @()
+        $waiting = @()
+        $good = @()
+
+        foreach ($svc in $services) {
+            $name = $svc.Service
+            $state = $svc.State
+            # Health / ExitCode are read defensively in case the field is absent.
+            $health = if ($svc.PSObject.Properties.Name -contains 'Health') { $svc.Health } else { '' }
+            $exit = if ($svc.PSObject.Properties.Name -contains 'ExitCode') { $svc.ExitCode } else { $null }
+
+            # `migrations` is a one-shot job: "exited with code 0" is its success state.
+            if ($name -eq 'migrations') {
+                if ($state -eq 'exited' -and $exit -eq 0) { $good += $name }
+                elseif ($state -eq 'exited') { $bad += "${name} (exit=${exit})" }
+                else { $waiting += "${name} (${state})" }
+                continue
+            }
+
+            if ($state -eq 'running') {
+                # A running container without a healthcheck counts as good.
+                if ([string]::IsNullOrEmpty($health) -or $health -eq 'healthy') {
+                    $good += $name
+                } elseif ($health -eq 'starting') {
+                    $waiting += "${name} (starting)"
+                } elseif ($health -eq 'unhealthy') {
+                    $bad += "${name} (unhealthy)"
+                } else {
+                    $waiting += "${name} (${health})"
+                }
+            } elseif ($state -eq 'restarting') {
+                $bad += "${name} (restarting)"
+            } elseif ($state -eq 'exited') {
+                $bad += "${name} (exited, code=${exit})"
+            } else {
+                $waiting += "${name} (${state})"
+            }
+        }
+
+        # Fail fast: one bad service is enough to stop waiting.
+        if ($bad.Count -gt 0) {
+            return @{ Ok = $false; Reason = 'failure'; Bad = $bad; Waiting = $waiting; Good = $good }
+        }
+        if ($waiting.Count -eq 0) {
+            return @{ Ok = $true; Reason = 'all_healthy'; Bad = @(); Waiting = @(); Good = $good }
+        }
+
+        # Only log when the set of pending services changed.
+        $report = "Waiting on: " + ($waiting -join ', ')
+        if ($report -ne $lastReport) {
+            Write-Info $report
+            $lastReport = $report
+        }
+        Start-Sleep -Seconds 5
+    }
+
+    return @{ Ok = $false; Reason = 'timeout'; Bad = $bad; Waiting = $waiting; Good = $good }
+}
+
+function Test-StaleZeroCacheVolume {
+    # Returns $true when the 'surfsense-zero-cache' volume exists but
+    # 'surfsense-zero-init' does not. Installs from before the fix created the
+    # former but never the latter, so that combination marks a volume that may
+    # hold a half-initialized SQLite replica from an earlier crash-loop;
+    # wiping it forces zero-cache to do a fresh initial sync.
+    $listing = Invoke-NativeSafe { docker volume ls --format '{{.Name}}' 2>$null }
+    if ([string]::IsNullOrWhiteSpace($listing)) { return $false }
+
+    $volumeNames = @($listing -split "`r?`n" | ForEach-Object { $_.Trim() } | Where-Object { $_ })
+    if ($volumeNames -notcontains 'surfsense-zero-cache') { return $false }
+    return ($volumeNames -notcontains 'surfsense-zero-init')
+}
+
+function Invoke-StaleZeroCacheCleanup {
+    # Removes a 'surfsense-zero-cache' volume left behind by an older install
+    # (as detected by Test-StaleZeroCacheVolume). Warns first and pauses five
+    # seconds so the user can abort, then stops the stack and deletes the volume.
+    if (-not (Test-StaleZeroCacheVolume)) { return }
+
+    Write-Warn "Detected pre-existing 'surfsense-zero-cache' volume from an install that"
+    Write-Warn "predates the migrations-service fix. It may contain a half-initialized"
+    Write-Warn "SQLite replica that would block zero-cache from starting."
+    Write-Warn "The volume will be removed in 5 seconds; press Ctrl+C to cancel."
+    Start-Sleep -Seconds 5
+
+    # try/finally so the caller's working directory is restored even if the
+    # compose call throws.
+    Push-Location $InstallDir
+    try {
+        Invoke-NativeSafe { docker compose down --remove-orphans 2>$null } | Out-Null
+    } finally {
+        Pop-Location
+    }
+    Invoke-NativeSafe { docker volume rm surfsense-zero-cache 2>$null } | Out-Null
+
+    # Re-check instead of assuming success: the volume may still be in use.
+    if (Test-StaleZeroCacheVolume) {
+        Write-Warn "Could not remove the surfsense-zero-cache volume; remove it manually with: docker volume rm surfsense-zero-cache"
+    } else {
+        Write-Ok "Removed surfsense-zero-cache volume; zero-cache will re-sync on next start."
+    }
+}
+
+function Write-Err-NoExit {
+    # Prints "[ERROR] <Message>" in red on the host; does not terminate the script.
+    param([string]$Message)
+    Write-Host -ForegroundColor Red -Object "[ERROR] $Message"
+}
+
+function Invoke-StackFailureReport {
+    # Prints a failure summary for a Wait-StackHealthy result, dumps recent
+    # container logs, lists recovery commands, and terminates with exit code 1.
+    #   -Result  Hashtable whose Bad / Waiting entries hold service descriptions.
+    param([hashtable]$Result)
+
+    Write-Host ""
+    Write-Err-NoExit "Stack did not reach a healthy state."
+    if ($Result.Bad.Count -gt 0) { Write-Host ("  Failed: " + ($Result.Bad -join ', ')) }
+    if ($Result.Waiting.Count -gt 0) { Write-Host ("  Stuck:  " + ($Result.Waiting -join ', ')) }
+
+    Write-Host ""
+    Write-Info "Recent logs from migrations / zero-cache / backend:"
+    Push-Location $InstallDir
+    try {
+        # stderr is merged into stdout so both streams appear in the dump.
+        Invoke-NativeSafe { docker compose logs --tail=60 migrations zero-cache backend 2>&1 } | Write-Host
+    } finally {
+        Pop-Location
+    }
+
+    Write-Host ""
+    Write-Host "Recovery hints:" -ForegroundColor Yellow
+    Write-Host "  1. Inspect migrations:   cd $InstallDir; docker compose logs migrations"
+    # NOTE(review): user and database are hard-coded to `surfsense`; the hint
+    # needs adjusting if DB_USER / DB_NAME were customised in .env.
+    Write-Host "  2. Verify publication:   cd $InstallDir; docker compose exec db psql -U surfsense -d surfsense -c 'SELECT pubname FROM pg_publication;'"
+    Write-Host "  3. Hard reset zero db:   cd $InstallDir; docker compose down; docker volume rm surfsense-zero-cache; docker compose up -d"
+    Write-Host ""
+    # Ends the whole script; callers never regain control.
+    exit 1
+}
+
 # ── Download files ──────────────────────────────────────────────────────────

 Write-Step "Downloading SurfSense files"
@ -191,6 +346,8 @@ if (-not (Test-Path $envPath)) {

 # ── Start containers ────────────────────────────────────────────────────────

+Invoke-StaleZeroCacheCleanup
+
 if ($MigrationMode) {
    $envContent = Get-Content $envPath
    $DbUser = ($envContent | Select-String '^DB_USER=' | ForEach-Object { ($_ -split '=',2)[1].Trim('"') }) | Select-Object -First 1
@ -251,7 +408,13 @@ if ($MigrationMode) {
    Push-Location $InstallDir
    Invoke-NativeSafe { docker compose up -d }
    Pop-Location
-    Write-Ok "All services started."
+    Write-Ok "All containers started; waiting for stack to become healthy..."
+
+    $waitResult = Wait-StackHealthy -TimeoutSec 300
+    if (-not $waitResult.Ok) {
+        Invoke-StackFailureReport -Result $waitResult
+    }
+    Write-Ok "All services healthy."

    Remove-Item $KeyFile -ErrorAction SilentlyContinue

@ -260,7 +423,13 @@ if ($MigrationMode) {
    Push-Location $InstallDir
    Invoke-NativeSafe { docker compose up -d }
    Pop-Location
-    Write-Ok "All services started."
+    Write-Ok "All containers started; waiting for stack to become healthy..."
+
+    $waitResult = Wait-StackHealthy -TimeoutSec 300
+    if (-not $waitResult.Ok) {
+        Invoke-StackFailureReport -Result $waitResult
+    }
+    Write-Ok "All services healthy."
 }

 # ── Watchtower (auto-update) ────────────────────────────────────────────────
--- a/docker/scripts/install.sh
+++ b/docker/scripts/install.sh
@ -97,6 +97,163 @@ wait_for_pg() {
    success "PostgreSQL is ready."
 }

+# ── Stack health helpers ─────────────────────────────────────────────────────
+
+# Enumerate compose services for project `surfsense` as `service|state|health|exitcode`
+# lines (health is empty when the container has no healthcheck). Uses
+# `docker inspect` so we don't depend on `jq`, `python3`, or the exact ordering
+# of fields in `docker compose ps --format json` output. Each container is
+# inspected exactly once, so all four fields come from the same snapshot.
+# Prints nothing (and returns 0) when the project has no containers.
+get_compose_services() {
+    local containers container row
+    local fmt='{{index .Config.Labels "com.docker.compose.service"}}|{{.State.Status}}|{{if .State.Health}}{{.State.Health.Status}}{{end}}|{{.State.ExitCode}}'
+
+    containers=$(docker ps -a --filter "label=com.docker.compose.project=surfsense" --format '{{.Names}}' 2>/dev/null) || true
+    [[ -z "$containers" ]] && return 0
+
+    while IFS= read -r container; do
+        [[ -z "$container" ]] && continue
+        # A container can vanish between `docker ps` and `docker inspect`; skip it.
+        row=$(docker inspect -f "$fmt" "$container" 2>/dev/null) || continue
+        # A row that starts with the delimiter has no compose service label.
+        [[ -z "$row" || "$row" == '|'* ]] && continue
+        printf '%s\n' "$row"
+    done <<< "$containers"
+}
+
+# Globals populated by wait_stack_healthy / consumed by stack_failure_report.
+STACK_BAD=()
+STACK_WAITING=()
+STACK_GOOD=()
+STACK_TIMEOUT=false
+
+# Copy the caller's `bad` / `waiting` / `good` locals into the STACK_* globals.
+# The `${arr[@]+"${arr[@]}"}` form expands to nothing for an empty array, so
+# it stays safe under `set -u` on bash releases older than 4.4, where a plain
+# "${arr[@]}" on an empty array is reported as an unbound variable.
+_stack_publish_state() {
+    STACK_BAD=(${bad[@]+"${bad[@]}"})
+    STACK_WAITING=(${waiting[@]+"${waiting[@]}"})
+    STACK_GOOD=(${good[@]+"${good[@]}"})
+}
+
+# Poll get_compose_services until every container has settled.
+#   $1  timeout in seconds (default 300)
+# Returns 0 when nothing is failed or pending; returns 1 as soon as a service
+# is classified as failed, or when the timeout elapses (STACK_TIMEOUT=true).
+# The last observed classification is left in STACK_BAD / STACK_WAITING /
+# STACK_GOOD.
+wait_stack_healthy() {
+    local timeout_sec=${1:-300}
+    local deadline=$(($(date +%s) + timeout_sec))
+    local last_report=""
+    local bad=()
+    local waiting=()
+    local good=()
+    local lines name state health code report
+
+    STACK_TIMEOUT=false
+
+    while [[ $(date +%s) -lt $deadline ]]; do
+        lines=$(get_compose_services)
+        if [[ -z "$lines" ]]; then
+            # Containers may not have been created yet; retry shortly.
+            sleep 3
+            continue
+        fi
+
+        bad=()
+        waiting=()
+        good=()
+
+        while IFS='|' read -r name state health code; do
+            [[ -z "$name" ]] && continue
+            # `migrations` is a one-shot job: "exited with code 0" is its success state.
+            if [[ "$name" == "migrations" ]]; then
+                if [[ "$state" == "exited" && "$code" == "0" ]]; then
+                    good+=("$name")
+                elif [[ "$state" == "exited" ]]; then
+                    bad+=("${name} (exit=${code})")
+                else
+                    waiting+=("${name} (${state})")
+                fi
+                continue
+            fi
+
+            if [[ "$state" == "running" ]]; then
+                # A running container without a healthcheck counts as good.
+                if [[ -z "$health" || "$health" == "healthy" ]]; then
+                    good+=("$name")
+                elif [[ "$health" == "starting" ]]; then
+                    waiting+=("${name} (starting)")
+                elif [[ "$health" == "unhealthy" ]]; then
+                    bad+=("${name} (unhealthy)")
+                else
+                    waiting+=("${name} (${health})")
+                fi
+            elif [[ "$state" == "restarting" ]]; then
+                bad+=("${name} (restarting)")
+            elif [[ "$state" == "exited" ]]; then
+                bad+=("${name} (exited, code=${code})")
+            else
+                waiting+=("${name} (${state})")
+            fi
+        done <<< "$lines"
+
+        # Fail fast: one bad service is enough to stop waiting.
+        if (( ${#bad[@]} > 0 )); then
+            _stack_publish_state
+            return 1
+        fi
+        if (( ${#waiting[@]} == 0 )); then
+            _stack_publish_state
+            return 0
+        fi
+
+        # Only log when the set of pending services changed.
+        report="Waiting on: ${waiting[*]}"
+        if [[ "$report" != "$last_report" ]]; then
+            info "$report"
+            last_report="$report"
+        fi
+        sleep 5
+    done
+
+    # bad/waiting/good are declared at function scope so publishing them is
+    # safe even if the polling loop never executed its body.
+    _stack_publish_state
+    STACK_TIMEOUT=true
+    return 1
+}
+
+# Print a failure summary from the STACK_BAD / STACK_WAITING globals, dump
+# recent container logs, list recovery commands, and exit the script with 1.
+stack_failure_report() {
+    echo ""
+    echo -e "\033[31m[ERROR]\033[0m Stack did not reach a healthy state."
+    # The first-element check also skips an array holding one empty string.
+    if (( ${#STACK_BAD[@]} > 0 )) && [[ -n "${STACK_BAD[0]}" ]]; then
+        echo "  Failed: ${STACK_BAD[*]}"
+    fi
+    if (( ${#STACK_WAITING[@]} > 0 )) && [[ -n "${STACK_WAITING[0]}" ]]; then
+        echo "  Stuck:  ${STACK_WAITING[*]}"
+    fi
+    echo ""
+    info "Recent logs from migrations / zero-cache / backend:"
+    # Best effort: a failing `logs` call must not hide the hints below.
+    (cd "${INSTALL_DIR}" && ${DC} logs --tail=60 migrations zero-cache backend 2>&1) || true
+    echo ""
+    echo "Recovery hints:"
+    echo "  1. Inspect migrations:   cd ${INSTALL_DIR} && ${DC} logs migrations"
+    # NOTE(review): user and database are hard-coded to `surfsense`; the hint
+    # needs adjusting if DB_USER / DB_NAME were customised in .env.
+    echo "  2. Verify publication:   cd ${INSTALL_DIR} && ${DC} exec db psql -U surfsense -d surfsense -c 'SELECT pubname FROM pg_publication;'"
+    echo "  3. Hard reset zero db:   cd ${INSTALL_DIR} && ${DC} down && docker volume rm surfsense-zero-cache && ${DC} up -d"
+    echo ""
+    # Ends the whole script; callers never regain control.
+    exit 1
+}
+
+# True if `surfsense-zero-cache` exists but `surfsense-zero-init` does not.
+# That signals an install that predates the migrations-service fix; the old
+# replica may be half-initialized and would block zero-cache on next start.
+# Volumes are listed once so both checks see the same snapshot. Returns
+# non-zero when the listing is empty or the docker call fails.
+test_stale_zero_cache_volume() {
+    local volumes
+    volumes=$(docker volume ls --format '{{.Name}}' 2>/dev/null) || true
+    # -F fixed string, -x whole-line match, -q status only.
+    grep -Fxq 'surfsense-zero-cache' <<< "$volumes" \
+        && ! grep -Fxq 'surfsense-zero-init' <<< "$volumes"
+}
+
+# Remove a `surfsense-zero-cache` volume left behind by an older install (as
+# detected by test_stale_zero_cache_volume). Warns first and pauses five
+# seconds so the user can abort, then stops the stack and deletes the volume.
+# Always returns 0; a failed removal is reported as a warning, not an error.
+invoke_stale_zero_cache_cleanup() {
+    if ! test_stale_zero_cache_volume; then
+        return 0
+    fi
+    warn "Detected pre-existing 'surfsense-zero-cache' volume from an install that"
+    warn "predates the migrations-service fix. It may contain a half-initialized"
+    warn "SQLite replica that would block zero-cache from starting."
+    warn "The volume will be removed in 5 seconds; press Ctrl+C to cancel."
+    sleep 5
+
+    # The stack must be down before the volume can be removed; best effort.
+    (cd "${INSTALL_DIR}" && ${DC} down --remove-orphans 2>/dev/null) || true
+
+    # Report the real outcome: the volume may still be in use elsewhere.
+    if docker volume rm surfsense-zero-cache 2>/dev/null; then
+        success "Removed surfsense-zero-cache volume; zero-cache will re-sync on next start."
+    else
+        warn "Could not remove the surfsense-zero-cache volume; remove it manually with: docker volume rm surfsense-zero-cache"
+    fi
+}
+
 # ── Download files ───────────────────────────────────────────────────────────

 step "Downloading SurfSense files"
@ -186,6 +343,8 @@ fi

 # ── Start containers ─────────────────────────────────────────────────────────

+invoke_stale_zero_cache_cleanup
+
 if $MIGRATION_MODE; then
    # Read DB credentials from .env (fall back to defaults from docker-compose.yml)
    DB_USER=$(grep '^DB_USER=' "${INSTALL_DIR}/.env" 2>/dev/null | cut -d= -f2 | tr -d '"' | head -1 || true)
@ -243,7 +402,12 @@ if $MIGRATION_MODE; then

    step "Starting all SurfSense services"
    (cd "${INSTALL_DIR}" && ${DC} up -d) < /dev/null
-    success "All services started."
+    success "All containers started; waiting for stack to become healthy..."
+
+    if ! wait_stack_healthy 300; then
+        stack_failure_report
+    fi
+    success "All services healthy."

    # Key file is no longer needed — SECRET_KEY is now in .env
    rm -f "${KEY_FILE}"
@ -251,7 +415,12 @@ if $MIGRATION_MODE; then
 else
    step "Starting SurfSense"
    (cd "${INSTALL_DIR}" && ${DC} up -d) < /dev/null
-    success "All services started."
+    success "All containers started; waiting for stack to become healthy..."
+
+    if ! wait_stack_healthy 300; then
+        stack_failure_report
+    fi
+    success "All services healthy."
 fi

 # ── Watchtower (auto-update) ─────────────────────────────────────────────────
--- a/surfsense_backend/Dockerfile
+++ b/surfsense_backend/Dockerfile
@ -167,10 +167,14 @@ COPY scripts/docker/entrypoint.sh /app/scripts/docker/entrypoint.sh
 RUN dos2unix /app/scripts/docker/entrypoint.sh && chmod +x /app/scripts/docker/entrypoint.sh

 # SERVICE_ROLE controls which process this container runs:
-#   api     – FastAPI backend only (runs migrations on startup)
+#   migrate – Run alembic upgrade head, verify zero_publication exists, exit 0.
+#             Used by the dedicated `migrations` service in docker-compose.yml
+#             so downstream services gate on `service_completed_successfully`.
+#   api     – FastAPI backend only (does NOT run migrations)
 #   worker  – Celery worker only
 #   beat    – Celery beat scheduler only
-#   all     – All three (legacy / dev default)
+#   all     – migrations + api + worker + beat (legacy / dev default;
+#             fails fast on migration error)
 ENV SERVICE_ROLE=all

 # Celery worker tuning (only used when SERVICE_ROLE=worker or all)
--- a/surfsense_backend/app/app.py
+++ b/surfsense_backend/app/app.py
@ -945,6 +945,36 @@ async def health_check():
    return {"status": "ok"}


+@app.get("/ready", tags=["health"])
+@limiter.exempt
+async def readiness_check():
+    """Readiness probe.
+
+    Verifies that the schema state required by downstream services is
+    present. Specifically checks that the ``zero_publication`` Postgres
+    logical-replication publication exists; without it zero-cache crash-loops
+    on `Unknown or invalid publications`.
+
+    Returns:
+        ``{"status": "ready"}`` with HTTP 200 when the publication exists.
+
+    Raises:
+        HTTPException: 503 when the publication is missing, and also 503
+            when the database cannot be reached or queried, so the probe
+            honours its "200 or 503" contract instead of surfacing an
+            unhandled error.
+
+    Used by the docker-compose backend healthcheck and by ``install.ps1`` /
+    ``install.sh`` post-up verification.
+    """
+    import logging
+
+    from sqlalchemy import text
+
+    from app.db import async_session_maker
+
+    try:
+        async with async_session_maker() as session:
+            result = await session.execute(
+                text("SELECT 1 FROM pg_publication WHERE pubname = 'zero_publication'")
+            )
+            publication_present = result.first() is not None
+    except Exception as exc:
+        # Any connection/query failure means "not ready"; keep the details in
+        # the server log rather than in the probe response.
+        logging.getLogger(__name__).warning(
+            "Readiness check could not query Postgres: %s", exc
+        )
+        raise HTTPException(
+            status_code=503,
+            detail="database unavailable; cannot verify zero_publication",
+        ) from exc
+
+    if not publication_present:
+        raise HTTPException(
+            status_code=503,
+            detail="zero_publication missing; run alembic upgrade head",
+        )
+    return {"status": "ready"}
+
+
@app.get("/verify-token")
 async def authenticated_route(
    user: User = Depends(current_active_user),
--- a/surfsense_backend/scripts/docker/entrypoint.sh
+++ b/surfsense_backend/scripts/docker/entrypoint.sh
@ -4,10 +4,15 @@ set -e
 # ─────────────────────────────────────────────────────────────
 # SERVICE_ROLE controls which process(es) this container runs.
 #
-#   api     – FastAPI backend only  (runs migrations on startup)
+#   migrate – Run `alembic upgrade head`, verify zero_publication,
+#             then exit 0. Used by the dedicated `migrations` service
+#             in docker-compose.yml so downstream services can gate
+#             on `condition: service_completed_successfully`.
+#   api     – FastAPI backend only (does NOT run migrations)
 #   worker  – Celery worker only
 #   beat    – Celery beat scheduler only
-#   all     – All three in one container (legacy / dev default)
+#   all     – migrations + api + worker + beat in one container
+#             (legacy / dev default; fails fast on migration error)
 #
 # Set SERVICE_ROLE as an environment variable in Coolify for
 # each service deployment.
@ -41,7 +46,13 @@ cleanup() {

 trap cleanup SIGTERM SIGINT

-# ── Database migrations (only for api / all) ─────────────────
+# ── Database migrations (only for migrate / all) ─────────────
+# Fail-fast contract:
+#   - alembic upgrade head must succeed within ${MIGRATION_TIMEOUT:-900}s
+#   - zero_publication must exist in pg_publication afterwards
+# Either failure exits non-zero so the dedicated `migrations` compose
+# service exits non-zero, halting the rest of the stack instead of
+# silently producing a half-built system that crash-loops zero-cache.
 run_migrations() {
    echo "Running database migrations..."
    for i in {1..30}; do
@ -53,11 +64,66 @@ run_migrations() {
        sleep 1
    done

-    if timeout 300 alembic upgrade head 2>&1; then
-        echo "Migrations completed successfully."
-    else
-        echo "WARNING: Migration failed or timed out. Continuing anyway..."
-        echo "You may need to run migrations manually: alembic upgrade head"
+    local timeout_secs="${MIGRATION_TIMEOUT:-900}"
+    echo "Running alembic upgrade head (timeout=${timeout_secs}s)..."
+    if ! timeout "${timeout_secs}" alembic upgrade head; then
+        echo "ERROR: alembic upgrade head failed (or exceeded ${timeout_secs}s timeout)." >&2
+        echo "Refusing to start. Inspect the error above and re-run." >&2
+        exit 1
+    fi
+    echo "Migrations completed successfully."
+
+    echo "Verifying zero_publication exists in Postgres..."
+    local pub_oid
+    pub_oid=$(python <<'PY' 2>/dev/null || true
+import asyncio
+import sys
+from sqlalchemy import text
+from app.db import engine
+
+
+async def get_oid():
+    async with engine.connect() as conn:
+        result = await conn.execute(
+            text("SELECT oid FROM pg_publication WHERE pubname = 'zero_publication'")
+        )
+        row = result.first()
+        if row is None:
+            sys.exit(1)
+        print(int(row[0]))
+
+
+asyncio.run(get_oid())
+PY
+)
+    if [ -z "${pub_oid}" ]; then
+        echo "ERROR: zero_publication is missing from Postgres after running alembic." >&2
+        echo "This usually means migration 116 (or a later publication migration) did not run." >&2
+        echo "Inspect alembic state with:" >&2
+        echo "  docker compose exec db psql -U \"\$DB_USER\" -d \"\$DB_NAME\" -c 'SELECT * FROM alembic_version;'" >&2
+        exit 1
+    fi
+    echo "zero_publication verified (oid=${pub_oid})."
+
+    # Stale-replica safety net: if /zero-init is mounted (i.e. we are the
+    # dedicated `migrations` compose service), drop a marker file when the
+    # publication oid changed (or on first run) so the wrapped zero-cache
+    # entrypoint can wipe /data/zero.db before starting. This recovers from
+    # the case where a previous zero-cache crashed mid-init and left a
+    # half-built SQLite replica without a `_zero.tableMetadata` table.
+    if [ -d /zero-init ]; then
+        local stored_oid=""
+        [ -f /zero-init/last_pub_oid ] && stored_oid=$(cat /zero-init/last_pub_oid 2>/dev/null || true)
+        if [ -z "${stored_oid}" ] || [ "${stored_oid}" != "${pub_oid}" ]; then
+            echo "Publication oid changed (stored=${stored_oid:-<none>}, current=${pub_oid}); writing /zero-init/needs_reset."
+            : > /zero-init/needs_reset
+            chmod 666 /zero-init/needs_reset 2>/dev/null || true
+        fi
+        echo "${pub_oid}" > /zero-init/last_pub_oid
+        chmod 666 /zero-init/last_pub_oid 2>/dev/null || true
+        # World-writable dir so the (possibly non-root) zero-cache container
+        # can `rm -f /zero-init/needs_reset` after acting on the marker.
+        chmod 777 /zero-init 2>/dev/null || true
    fi
 }

@ -102,8 +168,12 @@ start_beat() {

 # ── Main: run based on role ──────────────────────────────────
 case "${SERVICE_ROLE}" in
-    api)
+    migrate)
        run_migrations
+        echo "Migrations complete; exiting cleanly."
+        exit 0
+        ;;
+    api)
        start_api
        ;;
    worker)
@ -121,7 +191,7 @@ case "${SERVICE_ROLE}" in
        start_beat
        ;;
    *)
-        echo "ERROR: Unknown SERVICE_ROLE '${SERVICE_ROLE}'. Use: api, worker, beat, or all"
+        echo "ERROR: Unknown SERVICE_ROLE '${SERVICE_ROLE}'. Use: migrate, api, worker, beat, or all"
        exit 1
        ;;
 esac
--- a/surfsense_web/content/docs/docker-installation/docker-compose.mdx
+++ b/surfsense_web/content/docs/docker-installation/docker-compose.mdx
@ -71,7 +71,7 @@ Defaults work out of the box. Change `ZERO_ADMIN_PASSWORD` for security in produ
 | `ZERO_UPSTREAM_DB` | PostgreSQL connection URL for replication (must be a direct connection, not via pgbouncer) | *(built from DB_* vars)* |
 | `ZERO_CVR_DB` | PostgreSQL connection URL for client view records | *(built from DB_* vars)* |
 | `ZERO_CHANGE_DB` | PostgreSQL connection URL for replication log entries | *(built from DB_* vars)* |
-| `ZERO_APP_PUBLICATIONS` | PostgreSQL publication restricting which tables are replicated (created by migration 116) | `zero_publication` |
+| `ZERO_APP_PUBLICATIONS` | PostgreSQL publication restricting which tables are replicated (created by migration 116, verified by the `migrations` service before `zero-cache` starts) | `zero_publication` |
 | `ZERO_NUM_SYNC_WORKERS` | Number of view-sync worker processes. Must be ≤ connection pool sizes | `4` |
 | `ZERO_UPSTREAM_MAX_CONNS` | Max connections to upstream PostgreSQL for mutations | `20` |
 | `ZERO_CVR_MAX_CONNS` | Max connections to the CVR database | `30` |
@ -150,7 +150,9 @@ Uncomment the connectors you want to use. Redirect URIs follow the pattern `http
 | Service | Description |
 |---------|-------------|
 | `db` | PostgreSQL with pgvector extension |
+| `migrations` | Short-lived: runs `alembic upgrade head` and verifies `zero_publication`, then exits |
 | `redis` | Message broker for Celery |
+| `searxng` | Local privacy-respecting search backend |
 | `backend` | FastAPI application server |
 | `celery_worker` | Background task processing (document indexing, etc.) |
 | `celery_beat` | Periodic task scheduler (connector sync) |
@ -159,7 +161,42 @@ Uncomment the connectors you want to use. Redirect URIs follow the pattern `http

 All services start automatically with `docker compose up -d`.

-The backend includes a health check. Dependent services (workers, frontend) wait until the API is fully ready before starting. You can monitor startup progress with `docker compose ps` (look for `(health: starting)` → `(healthy)`).
+### How startup ordering works
+
+Schema migrations run as a dedicated `migrations` service that exits 0 on
+success and non-zero on failure. The other backend-image services and
+`zero-cache` all gate on it via `condition: service_completed_successfully`:
+
+```text
+db (healthy) ──▶ migrations (alembic upgrade head + verify zero_publication)
+                     │
+                     ├── exit 0 ─▶ backend ──▶ frontend
+                     │            celery_worker
+                     │            celery_beat
+                     │            zero-cache ──▶ frontend
+                     │
+                     └── exit ≠ 0 ─▶ compose halts the rest of the stack
+```
+
+This guarantees `zero-cache` only starts after `zero_publication` exists in
+Postgres. Before this design, a silent migration failure would leave
+`zero-cache` crash-looping with `Unknown or invalid publications. Specified:
+[zero_publication]. Found: []`.
+
+### Readiness vs liveness
+
+The backend exposes two endpoints:
+
+- `GET /health` — lightweight liveness probe (always returns 200 if the
+  process is up).
+- `GET /ready` — readiness probe that confirms `zero_publication` exists.
+  Returns 503 if not. The compose `backend.healthcheck` uses `/ready` so the
+  container only reports `healthy` once the schema is actually usable by
+  zero-cache.
+
+You can also monitor startup progress with `docker compose ps` (look for
+`(health: starting)` → `(healthy)`). The install script polls these states
+automatically and times out after 5 minutes if the stack does not converge.

 ---

@ -188,6 +225,90 @@ docker compose down -v

 - **Ports already in use**: Change the relevant `*_PORT` variable in `.env` and restart.
 - **Permission errors on Linux**: You may need to prefix `docker` commands with `sudo`.
- **Zero-cache not starting**: Check `docker compose logs zero-cache`. Ensure PostgreSQL has `wal_level=logical` (configured automatically by the bundled `postgresql.conf`).
 - **Real-time updates not working**: Open DevTools → Console and check for WebSocket errors. Verify `NEXT_PUBLIC_ZERO_CACHE_URL` matches the running zero-cache address.
 - **Line ending issues on Windows**: Run `git config --global core.autocrlf true` before cloning.
+
+### Migration service exited non-zero
+
+The `migrations` service exits non-zero in two cases:
+
+1. `alembic upgrade head` failed (timeout or SQL error).
+2. `alembic` succeeded but `zero_publication` is still missing from
+   `pg_publication`.
+
+Inspect the logs and the alembic state:
+
+```bash
+docker compose logs migrations
+docker compose exec db psql -U surfsense -d surfsense \
+  -c 'SELECT * FROM alembic_version;'
+docker compose exec db psql -U surfsense -d surfsense \
+  -c 'SELECT pubname FROM pg_publication;'
+```
+
+The default migration timeout is 900 seconds. Slow disks (Windows / WSL2)
+may need more — set `MIGRATION_TIMEOUT` in `.env` to increase it.
+
+### Zero-cache stuck on `Unknown or invalid publications`
+
+Symptom (in `docker compose logs zero-cache`):
+
+```text
+Error: Unknown or invalid publications. Specified: [zero_publication]. Found: []
+```
+
+This means `zero-cache` started before `zero_publication` was created. With
+`docker-compose.yml` and `docker-compose.dev.yml` this should not happen —
+the `migrations` service blocks `zero-cache` from starting. If you see it,
+your stack predates the fix, or `zero-cache` was started without its
+dependencies (e.g. `docker compose up --no-deps zero-cache`).
+
+Recovery:
+
+```bash
+docker compose down
+docker volume rm surfsense-zero-cache   # wipe half-built SQLite replica
+docker compose up -d                    # migrations runs first, then zero-cache
+```
+
+The install script (`install.ps1` / `install.sh`) detects this case
+automatically: if it finds a `surfsense-zero-cache` volume from a previous
+install with no matching `surfsense-zero-init` volume, it removes the stale
+volume before bringing the stack up.
+
+### Zero-cache crashes with `_zero.tableMetadata` errors
+
+This indicates a half-initialized SQLite replica left behind by a previous
+crash. The `migrations` service writes a marker file on a shared volume
+(`surfsense-zero-init`) on first run or when the publication OID changes;
+`zero-cache` then wipes its replica and re-syncs on its next start. If the
+marker did not trigger, run the recovery commands from the previous section.
+
+### Ensuring `wal_level = logical`
+
+Logical replication is required by zero-cache. The bundled
+`docker/postgresql.conf` sets `wal_level = logical` automatically. If you
+swap in your own config or use a managed Postgres, confirm with:
+
+```bash
+docker compose exec db psql -U surfsense -d surfsense \
+  -c "SHOW wal_level;"
+```
+
+### Using `docker-compose.deps-only.yml`
+
+`docker-compose.deps-only.yml` runs only the dependencies (Postgres, Redis,
+SearXNG, zero-cache) on Docker while the backend and frontend run on the
+host. Because there is no backend container in this stack, there is no
+`migrations` service either, and you must run alembic on the host **before**
+bringing the stack up:
+
+```bash
+cd surfsense_backend
+uv run alembic upgrade head
+cd ../docker
+docker compose -f docker-compose.deps-only.yml up -d
+```
+
+If you skip the alembic step, `zero-cache` will crash-loop with `Unknown or
+invalid publications. Specified: [zero_publication]`.