fix: docker one click setup

This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-05-20 01:25:07 -07:00
parent 8174949b38
commit b285293b4e
10 changed files with 681 additions and 27 deletions

View file

@ -97,6 +97,161 @@ function Wait-ForPostgres {
Write-Ok "PostgreSQL is ready."
}
# ── Stack health helpers ────────────────────────────────────────────────────
function Get-ComposeServices {
    <#
    .SYNOPSIS
    Returns the compose project's containers as parsed JSON objects.

    .DESCRIPTION
    Runs `docker compose ps -a --format json` from $InstallDir through the
    Invoke-NativeSafe helper (defined elsewhere in this script) and parses the
    output. Both output shapes are accepted: a single JSON array, or one JSON
    object per line.

    .OUTPUTS
    The parsed service objects. Nothing is emitted when the command produced
    no output or no line could be parsed.
    #>
    Push-Location $InstallDir
    try {
        $raw = Invoke-NativeSafe { docker compose ps -a --format json 2>$null }
    } finally {
        # Always restore the caller's location, even if the helper throws.
        Pop-Location
    }

    if ([string]::IsNullOrWhiteSpace($raw)) { return @() }

    # Depending on the Compose version the output is either one JSON array or
    # newline-delimited JSON objects (NOTE(review): the per-line form appears to
    # be what newer releases, v2.21+, emit - confirm against release notes).
    # Try to parse the output as a whole first.
    try {
        $parsed = $raw | ConvertFrom-Json
        return @($parsed)
    } catch {
        # Whole-document parse failed: fall back to parsing line by line.
        # Unparsable lines are skipped on purpose (best effort).
        $services = @()
        foreach ($line in ($raw -split "`r?`n")) {
            $line = $line.Trim()
            if (-not $line) { continue }
            try { $services += ($line | ConvertFrom-Json) } catch { }
        }
        return $services
    }
}
function Wait-StackHealthy {
    <#
    .SYNOPSIS
    Polls the compose stack until every service is healthy, one fails, or the
    timeout elapses.

    .PARAMETER TimeoutSec
    Maximum number of seconds to keep polling (default 300).

    .OUTPUTS
    A hashtable with keys Ok, Reason, Bad, Waiting and Good.
      Reason = 'all_healthy' : nothing is failed or pending (Ok = $true)
      Reason = 'failure'     : at least one service was classified as failed
      Reason = 'timeout'     : the deadline passed while services were pending
    Bad, Waiting and Good are always arrays (possibly empty).
    #>
    param([int]$TimeoutSec = 300)

    $deadline = (Get-Date).AddSeconds($TimeoutSec)
    $lastReport = ""

    # Initialised before the loop so the timeout result carries real (empty)
    # arrays even when no service was ever listed, instead of unset variables.
    $bad = @()
    $waiting = @()
    $good = @()

    while ((Get-Date) -lt $deadline) {
        $services = Get-ComposeServices
        if (-not $services -or $services.Count -eq 0) {
            # Containers may not have been created yet; retry shortly.
            Start-Sleep -Seconds 3
            continue
        }

        # Classification is recomputed from scratch on every poll.
        $bad = @()
        $waiting = @()
        $good = @()

        foreach ($svc in $services) {
            $name = $svc.Service
            $state = $svc.State
            # Health / ExitCode are read defensively: only if the property exists.
            $health = if ($svc.PSObject.Properties.Name -contains 'Health') { $svc.Health } else { '' }
            $exit = if ($svc.PSObject.Properties.Name -contains 'ExitCode') { $svc.ExitCode } else { $null }

            if ($name -eq 'migrations') {
                # 'migrations' is treated as a one-shot job: success means it
                # has exited with code 0, not that it is still running.
                if ($state -eq 'exited' -and $exit -eq 0) { $good += $name }
                elseif ($state -eq 'exited') { $bad += "${name} (exit=${exit})" }
                else { $waiting += "${name} (${state})" }
                continue
            }

            if ($state -eq 'running') {
                if ([string]::IsNullOrEmpty($health) -or $health -eq 'healthy') {
                    # No health value at all counts as healthy once running.
                    $good += $name
                } elseif ($health -eq 'starting') {
                    $waiting += "${name} (starting)"
                } elseif ($health -eq 'unhealthy') {
                    $bad += "${name} (unhealthy)"
                } else {
                    $waiting += "${name} (${health})"
                }
            } elseif ($state -eq 'restarting') {
                $bad += "${name} (restarting)"
            } elseif ($state -eq 'exited') {
                $bad += "${name} (exited, code=${exit})"
            } else {
                $waiting += "${name} (${state})"
            }
        }

        # Fail fast: one failed service is enough to stop waiting.
        if ($bad.Count -gt 0) {
            return @{ Ok = $false; Reason = 'failure'; Bad = $bad; Waiting = $waiting; Good = $good }
        }
        if ($waiting.Count -eq 0) {
            return @{ Ok = $true; Reason = 'all_healthy'; Bad = @(); Waiting = @(); Good = $good }
        }

        # Only log when the set of pending services changed, to avoid spam.
        $report = "Waiting on: " + ($waiting -join ', ')
        if ($report -ne $lastReport) {
            Write-Info $report
            $lastReport = $report
        }
        Start-Sleep -Seconds 5
    }

    return @{ Ok = $false; Reason = 'timeout'; Bad = $bad; Waiting = $waiting; Good = $good }
}
function Test-StaleZeroCacheVolume {
    <#
    .SYNOPSIS
    Reports whether a 'surfsense-zero-cache' volume exists without a matching
    'surfsense-zero-init' volume.

    .OUTPUTS
    [bool] $true only when the cache volume is present and the init volume is
    absent; $false otherwise (including when no volume names were returned).
    #>
    $listing = Invoke-NativeSafe { docker volume ls --format '{{.Name}}' 2>$null }
    if ([string]::IsNullOrWhiteSpace($listing)) { return $false }

    # Normalise the listing into trimmed, non-empty volume names.
    $volumes = foreach ($entry in ($listing -split "`r?`n")) {
        $trimmed = $entry.Trim()
        if ($trimmed) { $trimmed }
    }

    # Pre-fix installs created surfsense-zero-cache but never surfsense-zero-init.
    # Such a volume may hold a half-initialized SQLite replica from an earlier
    # crash-loop. Wiping it forces zero-cache to do a fresh initial sync.
    if ($volumes -notcontains 'surfsense-zero-cache') { return $false }
    return ($volumes -notcontains 'surfsense-zero-init')
}
function Invoke-StaleZeroCacheCleanup {
    <#
    .SYNOPSIS
    Removes a stale 'surfsense-zero-cache' volume, if one is detected.

    .DESCRIPTION
    Does nothing unless Test-StaleZeroCacheVolume reports a stale volume.
    Otherwise it warns, waits 5 seconds so the user can cancel, brings the
    compose project in $InstallDir down, and removes the volume. The outcome
    is verified before success is reported.
    #>
    if (-not (Test-StaleZeroCacheVolume)) { return }

    Write-Warn "Detected pre-existing 'surfsense-zero-cache' volume from an install that"
    Write-Warn "predates the migrations-service fix. It may contain a half-initialized"
    Write-Warn "SQLite replica that would block zero-cache from starting."
    Write-Warn "The volume will be removed in 5 seconds; press Ctrl+C to cancel."
    Start-Sleep -Seconds 5

    # The stack is brought down before the volume is removed.
    Push-Location $InstallDir
    try {
        Invoke-NativeSafe { docker compose down --remove-orphans 2>$null } | Out-Null
    } finally {
        # Restore the caller's location even if the helper throws.
        Pop-Location
    }
    Invoke-NativeSafe { docker volume rm surfsense-zero-cache 2>$null } | Out-Null

    # Errors from the commands above are discarded, so re-check rather than
    # assume the removal worked.
    if (Test-StaleZeroCacheVolume) {
        Write-Warn "Could not remove the 'surfsense-zero-cache' volume; remove it manually with: docker volume rm surfsense-zero-cache"
    } else {
        Write-Ok "Removed surfsense-zero-cache volume; zero-cache will re-sync on next start."
    }
}
function Write-Err-NoExit {
    <#
    .SYNOPSIS
    Prints an "[ERROR]"-prefixed message in red without terminating the script.

    .PARAMETER Message
    Text to print after the "[ERROR] " prefix.
    #>
    param([string]$Message)

    $line = "[ERROR] $Message"
    Write-Host -Object $line -ForegroundColor Red
}
function Invoke-StackFailureReport {
    <#
    .SYNOPSIS
    Prints a diagnostic report for a stack that did not become healthy, then
    exits the script with code 1.

    .PARAMETER Result
    Hashtable whose Bad and Waiting entries list the failed and still-pending
    services to print.
    #>
    param([hashtable]$Result)

    $failed = $Result.Bad
    $stuck = $Result.Waiting

    Write-Host ""
    Write-Err-NoExit "Stack did not reach a healthy state."
    if ($failed.Count -gt 0) {
        Write-Host (" Failed: " + ($failed -join ', '))
    }
    if ($stuck.Count -gt 0) {
        Write-Host (" Stuck: " + ($stuck -join ', '))
    }
    Write-Host ""

    # Show the tail of the logs of the listed services.
    Write-Info "Recent logs from migrations / zero-cache / backend:"
    Push-Location $InstallDir
    try {
        Invoke-NativeSafe { docker compose logs --tail=60 migrations zero-cache backend 2>&1 } | Write-Host
    } finally {
        Pop-Location
    }
    Write-Host ""

    Write-Host "Recovery hints:" -ForegroundColor Yellow
    $hints = @(
        " 1. Inspect migrations: cd $InstallDir; docker compose logs migrations",
        " 2. Verify publication: cd $InstallDir; docker compose exec db psql -U surfsense -d surfsense -c 'SELECT pubname FROM pg_publication;'",
        " 3. Hard reset zero db: cd $InstallDir; docker compose down; docker volume rm surfsense-zero-cache; docker compose up -d"
    )
    foreach ($hint in $hints) {
        Write-Host $hint
    }
    Write-Host ""

    # Terminates the script; this function does not return to its caller.
    exit 1
}
# ── Download files ──────────────────────────────────────────────────────────
Write-Step "Downloading SurfSense files"
@ -191,6 +346,8 @@ if (-not (Test-Path $envPath)) {
# ── Start containers ────────────────────────────────────────────────────────
Invoke-StaleZeroCacheCleanup
if ($MigrationMode) {
$envContent = Get-Content $envPath
$DbUser = ($envContent | Select-String '^DB_USER=' | ForEach-Object { ($_ -split '=',2)[1].Trim('"') }) | Select-Object -First 1
@ -251,7 +408,13 @@ if ($MigrationMode) {
Push-Location $InstallDir
Invoke-NativeSafe { docker compose up -d }
Pop-Location
Write-Ok "All services started."
Write-Ok "All containers started; waiting for stack to become healthy..."
$waitResult = Wait-StackHealthy -TimeoutSec 300
if (-not $waitResult.Ok) {
Invoke-StackFailureReport -Result $waitResult
}
Write-Ok "All services healthy."
Remove-Item $KeyFile -ErrorAction SilentlyContinue
@ -260,7 +423,13 @@ if ($MigrationMode) {
Push-Location $InstallDir
Invoke-NativeSafe { docker compose up -d }
Pop-Location
Write-Ok "All services started."
Write-Ok "All containers started; waiting for stack to become healthy..."
$waitResult = Wait-StackHealthy -TimeoutSec 300
if (-not $waitResult.Ok) {
Invoke-StackFailureReport -Result $waitResult
}
Write-Ok "All services healthy."
}
# ── Watchtower (auto-update) ────────────────────────────────────────────────

View file

@ -97,6 +97,163 @@ wait_for_pg() {
success "PostgreSQL is ready."
}
# ── Stack health helpers ─────────────────────────────────────────────────────
#######################################
# Enumerate the containers of compose project `surfsense`, one per line, as
#   service|state|health|exitcode
# Uses `docker inspect` so we don't depend on `jq`, `python3`, or the exact
# ordering of fields in `docker compose ps --format json` output.
# All four fields come from a single `docker inspect` call per container, so
# each line is a consistent snapshot (state and exit code cannot be read at
# different moments).
# Outputs: one line per container on stdout; nothing if none are found.
#          `health` is empty for containers that report no health status.
# Returns: 0, also when no containers exist.
#######################################
get_compose_services() {
  local containers container record
  local -r fmt='{{index .Config.Labels "com.docker.compose.service"}}|{{.State.Status}}|{{if .State.Health}}{{.State.Health.Status}}{{end}}|{{.State.ExitCode}}'

  containers=$(docker ps -a --filter "label=com.docker.compose.project=surfsense" --format '{{.Names}}' 2>/dev/null) || true
  [[ -z "$containers" ]] && return 0

  while IFS= read -r container; do
    [[ -z "$container" ]] && continue
    # A container can disappear between `docker ps` and `docker inspect`;
    # skip it rather than emit a partial record.
    record=$(docker inspect -f "$fmt" "$container" 2>/dev/null) || continue
    # Skip records without a service name (first field empty).
    [[ -z "${record%%|*}" ]] && continue
    printf '%s\n' "$record"
  done <<< "$containers"
}
# Result globals written by wait_stack_healthy so its classification survives
# the function return (its exit status only says healthy / not healthy).
#   STACK_BAD     - entries for services classified as failed
#   STACK_WAITING - entries for services that were still pending
#   STACK_GOOD    - names of services considered healthy
#   STACK_TIMEOUT - set to true when wait_stack_healthy hits its deadline
# stack_failure_report prints STACK_BAD and STACK_WAITING.
STACK_BAD=()
STACK_WAITING=()
STACK_GOOD=()
STACK_TIMEOUT=false
#######################################
# Poll the compose stack until every service is healthy, one has failed, or
# the timeout elapses.
# Globals:   STACK_BAD, STACK_WAITING, STACK_GOOD, STACK_TIMEOUT (all written;
#            reset at the start of every call)
# Arguments: $1 - timeout in seconds (default 300)
# Outputs:   progress messages via `info` when the pending set changes
# Returns:   0 when nothing is failed or pending;
#            1 on failure or timeout (STACK_TIMEOUT=true only on timeout)
#######################################
wait_stack_healthy() {
  local timeout_sec=${1:-300}
  local now deadline
  now=$(date +%s)
  deadline=$((now + timeout_sec))
  local last_report="" report lines
  # Fields filled by `read`; declared local so they do not leak into (or
  # clobber) variables of the calling script.
  local name state health code
  local bad=()
  local waiting=()
  local good=()

  # Start from a clean slate so a previous call's result cannot bleed through.
  STACK_BAD=()
  STACK_WAITING=()
  STACK_GOOD=()
  STACK_TIMEOUT=false

  while [[ $(date +%s) -lt $deadline ]]; do
    lines=$(get_compose_services) || lines=""
    if [[ -z "$lines" ]]; then
      # Containers may not have been created yet; retry shortly.
      sleep 3
      continue
    fi

    # Classification is recomputed from scratch on every poll.
    bad=()
    waiting=()
    good=()
    while IFS='|' read -r name state health code; do
      [[ -z "$name" ]] && continue

      if [[ "$name" == "migrations" ]]; then
        # `migrations` is treated as a one-shot job: success means it has
        # exited with code 0, not that it is still running.
        if [[ "$state" != "exited" ]]; then
          waiting+=("${name} (${state})")
        elif [[ "$code" == "0" ]]; then
          good+=("$name")
        else
          bad+=("${name} (exit=${code})")
        fi
        continue
      fi

      case "$state" in
        running)
          case "$health" in
            "" | healthy) good+=("$name") ;; # no health value counts as healthy
            starting) waiting+=("${name} (starting)") ;;
            unhealthy) bad+=("${name} (unhealthy)") ;;
            *) waiting+=("${name} (${health})") ;;
          esac
          ;;
        restarting) bad+=("${name} (restarting)") ;;
        exited) bad+=("${name} (exited, code=${code})") ;;
        *) waiting+=("${name} (${state})") ;;
      esac
    done <<< "$lines"

    # `${arr[@]+"${arr[@]}"}` expands to nothing for an empty array instead of
    # tripping `set -u` on bash < 4.4 (e.g. the bash 3.2 shipped with macOS).
    if ((${#bad[@]} > 0)); then
      # Fail fast: one failed service is enough to stop waiting.
      STACK_BAD=("${bad[@]}")
      STACK_WAITING=(${waiting[@]+"${waiting[@]}"})
      STACK_GOOD=(${good[@]+"${good[@]}"})
      return 1
    fi
    if ((${#waiting[@]} == 0)); then
      STACK_GOOD=(${good[@]+"${good[@]}"})
      return 0
    fi

    # Entries contain spaces ("name (state)"), so join them with ", ".
    printf -v report '%s, ' "${waiting[@]}"
    report="Waiting on: ${report%, }"
    # Only log when the pending set changed, to avoid spamming the console.
    if [[ "$report" != "$last_report" ]]; then
      info "$report"
      last_report="$report"
    fi
    sleep 5
  done

  # Deadline reached: publish whatever the last poll saw (possibly nothing).
  STACK_BAD=(${bad[@]+"${bad[@]}"})
  STACK_WAITING=(${waiting[@]+"${waiting[@]}"})
  STACK_GOOD=(${good[@]+"${good[@]}"})
  STACK_TIMEOUT=true
  return 1
}
#######################################
# Print a diagnostic report for a stack that did not become healthy, then
# terminate the script.
# Globals:   STACK_BAD, STACK_WAITING (read), INSTALL_DIR, DC (read)
# Outputs:   failed / stuck services, recent logs of migrations, zero-cache
#            and backend, and recovery hints
# Returns:   never returns; exits with status 1
#######################################
stack_failure_report() {
  local joined

  echo ""
  echo -e "\033[31m[ERROR]\033[0m Stack did not reach a healthy state."
  # Entries look like "name (state)" and contain spaces, so they are joined
  # with ", " to keep the list unambiguous.
  if ((${#STACK_BAD[@]} > 0)) && [[ -n "${STACK_BAD[0]}" ]]; then
    printf -v joined '%s, ' "${STACK_BAD[@]}"
    echo " Failed: ${joined%, }"
  fi
  if ((${#STACK_WAITING[@]} > 0)) && [[ -n "${STACK_WAITING[0]}" ]]; then
    printf -v joined '%s, ' "${STACK_WAITING[@]}"
    echo " Stuck: ${joined%, }"
  fi
  echo ""
  info "Recent logs from migrations / zero-cache / backend:"
  # ${DC} is deliberately unquoted: it may hold a multi-word command.
  # Log retrieval is best effort; a failure must not mask the report.
  (cd "${INSTALL_DIR}" && ${DC} logs --tail=60 migrations zero-cache backend 2>&1) || true
  echo ""
  echo "Recovery hints:"
  echo " 1. Inspect migrations: cd ${INSTALL_DIR} && ${DC} logs migrations"
  echo " 2. Verify publication: cd ${INSTALL_DIR} && ${DC} exec db psql -U surfsense -d surfsense -c 'SELECT pubname FROM pg_publication;'"
  echo " 3. Hard reset zero db: cd ${INSTALL_DIR} && ${DC} down && docker volume rm surfsense-zero-cache && ${DC} up -d"
  echo ""
  exit 1
}
#######################################
# True if `surfsense-zero-cache` exists but `surfsense-zero-init` does not.
# That signals an install that predates the migrations-service fix; the old
# replica may be half-initialized and would block zero-cache on next start.
# The volume list is fetched once so both checks see the same snapshot.
# Returns: 0 if the stale condition holds, 1 otherwise (also when the volume
#          list cannot be obtained).
#######################################
test_stale_zero_cache_volume() {
  local volumes
  volumes=$(docker volume ls --format '{{.Name}}' 2>/dev/null) || true
  # Drop carriage returns so exact-line matching still works if the docker
  # client emits CRLF line endings.
  volumes=${volumes//$'\r'/}

  # -F fixed string, -x whole line: avoids matching similarly named volumes.
  grep -Fxq 'surfsense-zero-cache' <<< "$volumes" || return 1
  if grep -Fxq 'surfsense-zero-init' <<< "$volumes"; then
    return 1
  fi
  return 0
}
#######################################
# Remove a stale `surfsense-zero-cache` volume, if one is detected.
# Does nothing unless test_stale_zero_cache_volume succeeds. Otherwise warns,
# waits 5 seconds so the user can cancel, brings the compose project down and
# removes the volume.
# Globals:   INSTALL_DIR, DC (read)
# Outputs:   warnings via `warn`; the outcome via `success` or `warn`
# Returns:   0 always (a failed cleanup is reported, not fatal)
#######################################
invoke_stale_zero_cache_cleanup() {
  if ! test_stale_zero_cache_volume; then
    return 0
  fi

  warn "Detected pre-existing 'surfsense-zero-cache' volume from an install that"
  warn "predates the migrations-service fix. It may contain a half-initialized"
  warn "SQLite replica that would block zero-cache from starting."
  warn "The volume will be removed in 5 seconds; press Ctrl+C to cancel."
  sleep 5

  # The stack is brought down before the volume is removed. Best effort: the
  # project may not be running at all.
  # ${DC} is deliberately unquoted: it may hold a multi-word command.
  (cd "${INSTALL_DIR}" && ${DC} down --remove-orphans 2>/dev/null) || true

  # Report what actually happened instead of always claiming success.
  if docker volume rm surfsense-zero-cache 2>/dev/null; then
    success "Removed surfsense-zero-cache volume; zero-cache will re-sync on next start."
  else
    warn "Could not remove the 'surfsense-zero-cache' volume; remove it manually with: docker volume rm surfsense-zero-cache"
  fi
  return 0
}
# ── Download files ───────────────────────────────────────────────────────────
step "Downloading SurfSense files"
@ -186,6 +343,8 @@ fi
# ── Start containers ─────────────────────────────────────────────────────────
invoke_stale_zero_cache_cleanup
if $MIGRATION_MODE; then
# Read DB credentials from .env (fall back to defaults from docker-compose.yml)
DB_USER=$(grep '^DB_USER=' "${INSTALL_DIR}/.env" 2>/dev/null | cut -d= -f2 | tr -d '"' | head -1 || true)
@ -243,7 +402,12 @@ if $MIGRATION_MODE; then
step "Starting all SurfSense services"
(cd "${INSTALL_DIR}" && ${DC} up -d) < /dev/null
success "All services started."
success "All containers started; waiting for stack to become healthy..."
if ! wait_stack_healthy 300; then
stack_failure_report
fi
success "All services healthy."
# Key file is no longer needed — SECRET_KEY is now in .env
rm -f "${KEY_FILE}"
@ -251,7 +415,12 @@ if $MIGRATION_MODE; then
else
step "Starting SurfSense"
(cd "${INSTALL_DIR}" && ${DC} up -d) < /dev/null
success "All services started."
success "All containers started; waiting for stack to become healthy..."
if ! wait_stack_healthy 300; then
stack_failure_report
fi
success "All services healthy."
fi
# ── Watchtower (auto-update) ─────────────────────────────────────────────────