mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-23 19:05:16 +02:00
fix: docker one click setup
This commit is contained in:
parent
8174949b38
commit
b285293b4e
10 changed files with 681 additions and 27 deletions
7
.gitignore
vendored
7
.gitignore
vendored
|
|
@ -6,16 +6,15 @@ node_modules/
|
||||||
.venv
|
.venv
|
||||||
.pnpm-store
|
.pnpm-store
|
||||||
.DS_Store
|
.DS_Store
|
||||||
deepagents/
|
|
||||||
debug.log
|
debug.log
|
||||||
opencode/
|
|
||||||
|
references/
|
||||||
|
references
|
||||||
|
|
||||||
# Playwright (E2E test artifacts)
|
# Playwright (E2E test artifacts)
|
||||||
surfsense_web/playwright/.auth/
|
surfsense_web/playwright/.auth/
|
||||||
surfsense_web/playwright-report/
|
surfsense_web/playwright-report/
|
||||||
surfsense_web/test-results/
|
surfsense_web/test-results/
|
||||||
surfsense_web/blob-report/
|
surfsense_web/blob-report/
|
||||||
hermes-agent
|
|
||||||
hermes-agent/
|
|
||||||
|
|
||||||
content_research/
|
content_research/
|
||||||
|
|
|
||||||
|
|
@ -20,6 +20,18 @@
|
||||||
# - Backend .env: SEARXNG_DEFAULT_HOST=http://localhost:${SEARXNG_PORT:-8888}
|
# - Backend .env: SEARXNG_DEFAULT_HOST=http://localhost:${SEARXNG_PORT:-8888}
|
||||||
# - Backend .env: CELERY_BROKER_URL / REDIS_APP_URL → redis://localhost:6379/0
|
# - Backend .env: CELERY_BROKER_URL / REDIS_APP_URL → redis://localhost:6379/0
|
||||||
# - Web .env: NEXT_PUBLIC_ZERO_CACHE_URL=http://localhost:${ZERO_CACHE_PORT:-4848}
|
# - Web .env: NEXT_PUBLIC_ZERO_CACHE_URL=http://localhost:${ZERO_CACHE_PORT:-4848}
|
||||||
|
#
|
||||||
|
# IMPORTANT — schema migrations:
|
||||||
|
# This compose file does NOT build the backend image and therefore cannot
|
||||||
|
# run a `migrations` service. You MUST run alembic on the host before
|
||||||
|
# bringing zero-cache up, or zero-cache will crash-loop with
|
||||||
|
# `Unknown or invalid publications. Specified: [zero_publication]`.
|
||||||
|
#
|
||||||
|
# First-time / after-pull:
|
||||||
|
# cd surfsense_backend && uv run alembic upgrade head
|
||||||
|
#
|
||||||
|
# The other compose files (docker-compose.yml, docker-compose.dev.yml)
|
||||||
|
# handle this automatically via a dedicated `migrations` service.
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|
||||||
name: surfsense-deps
|
name: surfsense-deps
|
||||||
|
|
@ -82,6 +94,10 @@ services:
|
||||||
timeout: 5s
|
timeout: 5s
|
||||||
retries: 5
|
retries: 5
|
||||||
|
|
||||||
|
# NOTE: zero-cache requires the `zero_publication` Postgres publication to
|
||||||
|
# exist before it starts. In this deps-only stack there is no backend
|
||||||
|
# container to run migrations, so you must run `uv run alembic upgrade head`
|
||||||
|
# from `surfsense_backend/` on the host BEFORE `docker compose up -d`.
|
||||||
zero-cache:
|
zero-cache:
|
||||||
image: rocicorp/zero:1.4.0
|
image: rocicorp/zero:1.4.0
|
||||||
ports:
|
ports:
|
||||||
|
|
|
||||||
|
|
@ -34,6 +34,25 @@ services:
|
||||||
timeout: 5s
|
timeout: 5s
|
||||||
retries: 5
|
retries: 5
|
||||||
|
|
||||||
|
# Short-lived schema runner; see docker/docker-compose.yml `migrations`
|
||||||
|
# service for the full rationale. Builds from the same backend context as
|
||||||
|
# the dev backend/celery services.
|
||||||
|
migrations:
|
||||||
|
build: *backend-build
|
||||||
|
env_file:
|
||||||
|
- ../surfsense_backend/.env
|
||||||
|
environment:
|
||||||
|
- DATABASE_URL=${DATABASE_URL:-postgresql+asyncpg://${DB_USER:-postgres}:${DB_PASSWORD:-postgres}@${DB_HOST:-db}:${DB_PORT:-5432}/${DB_NAME:-surfsense}}
|
||||||
|
- PYTHONPATH=/app
|
||||||
|
- SERVICE_ROLE=migrate
|
||||||
|
- MIGRATION_TIMEOUT=${MIGRATION_TIMEOUT:-900}
|
||||||
|
volumes:
|
||||||
|
- zero_init:/zero-init
|
||||||
|
depends_on:
|
||||||
|
db:
|
||||||
|
condition: service_healthy
|
||||||
|
restart: "no"
|
||||||
|
|
||||||
pgadmin:
|
pgadmin:
|
||||||
image: dpage/pgadmin4
|
image: dpage/pgadmin4
|
||||||
ports:
|
ports:
|
||||||
|
|
@ -111,8 +130,10 @@ services:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
searxng:
|
searxng:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
|
migrations:
|
||||||
|
condition: service_completed_successfully
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
|
test: ["CMD", "curl", "-f", "http://localhost:8000/ready"]
|
||||||
interval: 15s
|
interval: 15s
|
||||||
timeout: 5s
|
timeout: 5s
|
||||||
retries: 30
|
retries: 30
|
||||||
|
|
@ -141,6 +162,8 @@ services:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
redis:
|
redis:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
|
migrations:
|
||||||
|
condition: service_completed_successfully
|
||||||
backend:
|
backend:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
|
|
||||||
|
|
@ -160,6 +183,8 @@ services:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
redis:
|
redis:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
|
migrations:
|
||||||
|
condition: service_completed_successfully
|
||||||
celery_worker:
|
celery_worker:
|
||||||
condition: service_started
|
condition: service_started
|
||||||
|
|
||||||
|
|
@ -185,8 +210,10 @@ services:
|
||||||
extra_hosts:
|
extra_hosts:
|
||||||
- "host.docker.internal:host-gateway"
|
- "host.docker.internal:host-gateway"
|
||||||
depends_on:
|
depends_on:
|
||||||
backend:
|
db:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
|
migrations:
|
||||||
|
condition: service_completed_successfully
|
||||||
environment:
|
environment:
|
||||||
- ZERO_UPSTREAM_DB=${ZERO_UPSTREAM_DB:-postgresql://${DB_USER:-postgres}:${DB_PASSWORD:-postgres}@${DB_HOST:-db}:${DB_PORT:-5432}/${DB_NAME:-surfsense}?sslmode=${DB_SSLMODE:-disable}}
|
- ZERO_UPSTREAM_DB=${ZERO_UPSTREAM_DB:-postgresql://${DB_USER:-postgres}:${DB_PASSWORD:-postgres}@${DB_HOST:-db}:${DB_PORT:-5432}/${DB_NAME:-surfsense}?sslmode=${DB_SSLMODE:-disable}}
|
||||||
- ZERO_CVR_DB=${ZERO_CVR_DB:-postgresql://${DB_USER:-postgres}:${DB_PASSWORD:-postgres}@${DB_HOST:-db}:${DB_PORT:-5432}/${DB_NAME:-surfsense}?sslmode=${DB_SSLMODE:-disable}}
|
- ZERO_CVR_DB=${ZERO_CVR_DB:-postgresql://${DB_USER:-postgres}:${DB_PASSWORD:-postgres}@${DB_HOST:-db}:${DB_PORT:-5432}/${DB_NAME:-surfsense}?sslmode=${DB_SSLMODE:-disable}}
|
||||||
|
|
@ -201,6 +228,12 @@ services:
|
||||||
- ZERO_MUTATE_URL=${ZERO_MUTATE_URL:-http://frontend:3000/api/zero/mutate}
|
- ZERO_MUTATE_URL=${ZERO_MUTATE_URL:-http://frontend:3000/api/zero/mutate}
|
||||||
volumes:
|
volumes:
|
||||||
- zero_cache_data:/data
|
- zero_cache_data:/data
|
||||||
|
- zero_init:/zero-init
|
||||||
|
# Wrapper: see docker/docker-compose.yml `zero-cache` for rationale.
|
||||||
|
entrypoint: ["sh", "-c"]
|
||||||
|
# Pass the script as a single list element so Compose does not tokenize it.
|
||||||
|
command:
|
||||||
|
- 'if [ -f /zero-init/needs_reset ]; then echo "[zero-init] publication change detected; wiping replica file(s) under /data" && rm -f /data/zero.db /data/zero.db-shm /data/zero.db-wal && rm -f /zero-init/needs_reset; fi; exec zero-cache'
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: ["CMD", "curl", "-f", "http://localhost:4848/keepalive"]
|
test: ["CMD", "curl", "-f", "http://localhost:4848/keepalive"]
|
||||||
|
|
@ -238,3 +271,5 @@ volumes:
|
||||||
name: surfsense-dev-shared-temp
|
name: surfsense-dev-shared-temp
|
||||||
zero_cache_data:
|
zero_cache_data:
|
||||||
name: surfsense-dev-zero-cache
|
name: surfsense-dev-zero-cache
|
||||||
|
zero_init:
|
||||||
|
name: surfsense-dev-zero-init
|
||||||
|
|
|
||||||
|
|
@ -27,6 +27,28 @@ services:
|
||||||
timeout: 5s
|
timeout: 5s
|
||||||
retries: 5
|
retries: 5
|
||||||
|
|
||||||
|
# Short-lived schema runner. Executes `alembic upgrade head` and verifies
|
||||||
|
# that the `zero_publication` Postgres logical-replication publication
|
||||||
|
# exists, then exits 0. Downstream services (backend, celery_*, zero-cache)
|
||||||
|
# gate on this with `condition: service_completed_successfully` so a failed
|
||||||
|
# migration halts the whole stack instead of silently producing a half-built
|
||||||
|
# system that crash-loops zero-cache on missing publications.
|
||||||
|
migrations:
|
||||||
|
image: ghcr.io/modsetter/surfsense-backend:${SURFSENSE_VERSION:-latest}
|
||||||
|
env_file:
|
||||||
|
- .env
|
||||||
|
environment:
|
||||||
|
DATABASE_URL: ${DATABASE_URL:-postgresql+asyncpg://${DB_USER:-surfsense}:${DB_PASSWORD:-surfsense}@${DB_HOST:-db}:${DB_PORT:-5432}/${DB_NAME:-surfsense}}
|
||||||
|
PYTHONPATH: /app
|
||||||
|
SERVICE_ROLE: migrate
|
||||||
|
MIGRATION_TIMEOUT: ${MIGRATION_TIMEOUT:-900}
|
||||||
|
volumes:
|
||||||
|
- zero_init:/zero-init
|
||||||
|
depends_on:
|
||||||
|
db:
|
||||||
|
condition: service_healthy
|
||||||
|
restart: "no"
|
||||||
|
|
||||||
redis:
|
redis:
|
||||||
image: redis:8-alpine
|
image: redis:8-alpine
|
||||||
volumes:
|
volumes:
|
||||||
|
|
@ -88,9 +110,11 @@ services:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
searxng:
|
searxng:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
|
migrations:
|
||||||
|
condition: service_completed_successfully
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
|
test: ["CMD", "curl", "-f", "http://localhost:8000/ready"]
|
||||||
interval: 15s
|
interval: 15s
|
||||||
timeout: 5s
|
timeout: 5s
|
||||||
retries: 30
|
retries: 30
|
||||||
|
|
@ -118,6 +142,8 @@ services:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
redis:
|
redis:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
|
migrations:
|
||||||
|
condition: service_completed_successfully
|
||||||
backend:
|
backend:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
labels:
|
labels:
|
||||||
|
|
@ -140,6 +166,8 @@ services:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
redis:
|
redis:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
|
migrations:
|
||||||
|
condition: service_completed_successfully
|
||||||
celery_worker:
|
celery_worker:
|
||||||
condition: service_started
|
condition: service_started
|
||||||
labels:
|
labels:
|
||||||
|
|
@ -182,10 +210,21 @@ services:
|
||||||
ZERO_MUTATE_URL: ${ZERO_MUTATE_URL:-http://frontend:3000/api/zero/mutate}
|
ZERO_MUTATE_URL: ${ZERO_MUTATE_URL:-http://frontend:3000/api/zero/mutate}
|
||||||
volumes:
|
volumes:
|
||||||
- zero_cache_data:/data
|
- zero_cache_data:/data
|
||||||
|
- zero_init:/zero-init
|
||||||
|
# Wrapper: if the migrations service flagged a publication change via
|
||||||
|
# /zero-init/needs_reset, wipe the SQLite replica before starting so
|
||||||
|
# zero-cache does a clean initial sync. Recovers from the half-built
|
||||||
|
# replica state (`_zero.tableMetadata` missing) caused by earlier crashes.
|
||||||
|
entrypoint: ["sh", "-c"]
|
||||||
|
# Pass the script as a single list element so Compose does not tokenize it.
|
||||||
|
command:
|
||||||
|
- 'if [ -f /zero-init/needs_reset ]; then echo "[zero-init] publication change detected; wiping replica file(s) under /data" && rm -f /data/zero.db /data/zero.db-shm /data/zero.db-wal && rm -f /zero-init/needs_reset; fi; exec zero-cache'
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
depends_on:
|
depends_on:
|
||||||
backend:
|
db:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
|
migrations:
|
||||||
|
condition: service_completed_successfully
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: ["CMD", "curl", "-f", "http://localhost:4848/keepalive"]
|
test: ["CMD", "curl", "-f", "http://localhost:4848/keepalive"]
|
||||||
interval: 10s
|
interval: 10s
|
||||||
|
|
@ -221,3 +260,5 @@ volumes:
|
||||||
name: surfsense-shared-temp
|
name: surfsense-shared-temp
|
||||||
zero_cache_data:
|
zero_cache_data:
|
||||||
name: surfsense-zero-cache
|
name: surfsense-zero-cache
|
||||||
|
zero_init:
|
||||||
|
name: surfsense-zero-init
|
||||||
|
|
|
||||||
|
|
@ -97,6 +97,161 @@ function Wait-ForPostgres {
|
||||||
Write-Ok "PostgreSQL is ready."
|
Write-Ok "PostgreSQL is ready."
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# ── Stack health helpers ────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
function Get-ComposeServices {
|
||||||
|
Push-Location $InstallDir
|
||||||
|
try {
|
||||||
|
$raw = Invoke-NativeSafe { docker compose ps -a --format json 2>$null }
|
||||||
|
} finally {
|
||||||
|
Pop-Location
|
||||||
|
}
|
||||||
|
if ([string]::IsNullOrWhiteSpace($raw)) { return @() }
|
||||||
|
|
||||||
|
# Compose v2.21+ emits a JSON array; older versions emit one object per line.
|
||||||
|
try {
|
||||||
|
$parsed = $raw | ConvertFrom-Json
|
||||||
|
if ($parsed -is [System.Collections.IEnumerable] -and -not ($parsed -is [string])) {
|
||||||
|
return @($parsed)
|
||||||
|
}
|
||||||
|
return @($parsed)
|
||||||
|
} catch {
|
||||||
|
$services = @()
|
||||||
|
foreach ($line in ($raw -split "`r?`n")) {
|
||||||
|
$line = $line.Trim()
|
||||||
|
if (-not $line) { continue }
|
||||||
|
try { $services += ($line | ConvertFrom-Json) } catch { }
|
||||||
|
}
|
||||||
|
return $services
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function Wait-StackHealthy {
|
||||||
|
param([int]$TimeoutSec = 300)
|
||||||
|
|
||||||
|
$deadline = (Get-Date).AddSeconds($TimeoutSec)
|
||||||
|
$lastReport = ""
|
||||||
|
|
||||||
|
while ((Get-Date) -lt $deadline) {
|
||||||
|
$services = Get-ComposeServices
|
||||||
|
if (-not $services -or $services.Count -eq 0) {
|
||||||
|
Start-Sleep -Seconds 3
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
$bad = @()
|
||||||
|
$waiting = @()
|
||||||
|
$good = @()
|
||||||
|
|
||||||
|
foreach ($svc in $services) {
|
||||||
|
$name = $svc.Service
|
||||||
|
$state = $svc.State
|
||||||
|
$health = if ($svc.PSObject.Properties.Name -contains 'Health') { $svc.Health } else { '' }
|
||||||
|
$exit = if ($svc.PSObject.Properties.Name -contains 'ExitCode') { $svc.ExitCode } else { $null }
|
||||||
|
|
||||||
|
if ($name -eq 'migrations') {
|
||||||
|
if ($state -eq 'exited' -and $exit -eq 0) { $good += $name }
|
||||||
|
elseif ($state -eq 'exited') { $bad += "${name} (exit=${exit})" }
|
||||||
|
else { $waiting += "${name} (${state})" }
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if ($state -eq 'running') {
|
||||||
|
if ([string]::IsNullOrEmpty($health) -or $health -eq 'healthy') {
|
||||||
|
$good += $name
|
||||||
|
} elseif ($health -eq 'starting') {
|
||||||
|
$waiting += "${name} (starting)"
|
||||||
|
} elseif ($health -eq 'unhealthy') {
|
||||||
|
$bad += "${name} (unhealthy)"
|
||||||
|
} else {
|
||||||
|
$waiting += "${name} (${health})"
|
||||||
|
}
|
||||||
|
} elseif ($state -eq 'restarting') {
|
||||||
|
$bad += "${name} (restarting)"
|
||||||
|
} elseif ($state -eq 'exited') {
|
||||||
|
$bad += "${name} (exited, code=${exit})"
|
||||||
|
} else {
|
||||||
|
$waiting += "${name} (${state})"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ($bad.Count -gt 0) {
|
||||||
|
return @{ Ok = $false; Reason = 'failure'; Bad = $bad; Waiting = $waiting; Good = $good }
|
||||||
|
}
|
||||||
|
if ($waiting.Count -eq 0) {
|
||||||
|
return @{ Ok = $true; Reason = 'all_healthy'; Good = $good }
|
||||||
|
}
|
||||||
|
|
||||||
|
$report = "Waiting on: " + ($waiting -join ', ')
|
||||||
|
if ($report -ne $lastReport) {
|
||||||
|
Write-Info $report
|
||||||
|
$lastReport = $report
|
||||||
|
}
|
||||||
|
Start-Sleep -Seconds 5
|
||||||
|
}
|
||||||
|
|
||||||
|
return @{ Ok = $false; Reason = 'timeout'; Bad = $bad; Waiting = $waiting; Good = $good }
|
||||||
|
}
|
||||||
|
|
||||||
|
function Test-StaleZeroCacheVolume {
|
||||||
|
$raw = Invoke-NativeSafe { docker volume ls --format '{{.Name}}' 2>$null }
|
||||||
|
if ([string]::IsNullOrWhiteSpace($raw)) { return $false }
|
||||||
|
$names = $raw -split "`r?`n" | ForEach-Object { $_.Trim() } | Where-Object { $_ }
|
||||||
|
$hasZeroCache = $names -contains 'surfsense-zero-cache'
|
||||||
|
$hasZeroInit = $names -contains 'surfsense-zero-init'
|
||||||
|
# Pre-fix installs created surfsense-zero-cache but never surfsense-zero-init.
|
||||||
|
# Such a volume may hold a half-initialized SQLite replica from an earlier
|
||||||
|
# crash-loop. Wiping it forces zero-cache to do a fresh initial sync.
|
||||||
|
return ($hasZeroCache -and -not $hasZeroInit)
|
||||||
|
}
|
||||||
|
|
||||||
|
function Invoke-StaleZeroCacheCleanup {
|
||||||
|
if (-not (Test-StaleZeroCacheVolume)) { return }
|
||||||
|
|
||||||
|
Write-Warn "Detected pre-existing 'surfsense-zero-cache' volume from an install that"
|
||||||
|
Write-Warn "predates the migrations-service fix. It may contain a half-initialized"
|
||||||
|
Write-Warn "SQLite replica that would block zero-cache from starting."
|
||||||
|
Write-Warn "The volume will be removed in 5 seconds; press Ctrl+C to cancel."
|
||||||
|
Start-Sleep -Seconds 5
|
||||||
|
|
||||||
|
Push-Location $InstallDir
|
||||||
|
Invoke-NativeSafe { docker compose down --remove-orphans 2>$null } | Out-Null
|
||||||
|
Pop-Location
|
||||||
|
Invoke-NativeSafe { docker volume rm surfsense-zero-cache 2>$null } | Out-Null
|
||||||
|
Write-Ok "Removed surfsense-zero-cache volume; zero-cache will re-sync on next start."
|
||||||
|
}
|
||||||
|
|
||||||
|
function Write-Err-NoExit {
|
||||||
|
param([string]$Message)
|
||||||
|
Write-Host "[ERROR] $Message" -ForegroundColor Red
|
||||||
|
}
|
||||||
|
|
||||||
|
function Invoke-StackFailureReport {
|
||||||
|
param([hashtable]$Result)
|
||||||
|
|
||||||
|
Write-Host ""
|
||||||
|
Write-Err-NoExit "Stack did not reach a healthy state."
|
||||||
|
if ($Result.Bad.Count -gt 0) { Write-Host (" Failed: " + ($Result.Bad -join ', ')) }
|
||||||
|
if ($Result.Waiting.Count -gt 0) { Write-Host (" Stuck: " + ($Result.Waiting -join ', ')) }
|
||||||
|
|
||||||
|
Write-Host ""
|
||||||
|
Write-Info "Recent logs from migrations / zero-cache / backend:"
|
||||||
|
Push-Location $InstallDir
|
||||||
|
try {
|
||||||
|
Invoke-NativeSafe { docker compose logs --tail=60 migrations zero-cache backend 2>&1 } | Write-Host
|
||||||
|
} finally {
|
||||||
|
Pop-Location
|
||||||
|
}
|
||||||
|
|
||||||
|
Write-Host ""
|
||||||
|
Write-Host "Recovery hints:" -ForegroundColor Yellow
|
||||||
|
Write-Host " 1. Inspect migrations: cd $InstallDir; docker compose logs migrations"
|
||||||
|
Write-Host " 2. Verify publication: cd $InstallDir; docker compose exec db psql -U surfsense -d surfsense -c 'SELECT pubname FROM pg_publication;'"
|
||||||
|
Write-Host " 3. Hard reset zero db: cd $InstallDir; docker compose down; docker volume rm surfsense-zero-cache; docker compose up -d"
|
||||||
|
Write-Host ""
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
# ── Download files ──────────────────────────────────────────────────────────
|
# ── Download files ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
Write-Step "Downloading SurfSense files"
|
Write-Step "Downloading SurfSense files"
|
||||||
|
|
@ -191,6 +346,8 @@ if (-not (Test-Path $envPath)) {
|
||||||
|
|
||||||
# ── Start containers ────────────────────────────────────────────────────────
|
# ── Start containers ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
Invoke-StaleZeroCacheCleanup
|
||||||
|
|
||||||
if ($MigrationMode) {
|
if ($MigrationMode) {
|
||||||
$envContent = Get-Content $envPath
|
$envContent = Get-Content $envPath
|
||||||
$DbUser = ($envContent | Select-String '^DB_USER=' | ForEach-Object { ($_ -split '=',2)[1].Trim('"') }) | Select-Object -First 1
|
$DbUser = ($envContent | Select-String '^DB_USER=' | ForEach-Object { ($_ -split '=',2)[1].Trim('"') }) | Select-Object -First 1
|
||||||
|
|
@ -251,7 +408,13 @@ if ($MigrationMode) {
|
||||||
Push-Location $InstallDir
|
Push-Location $InstallDir
|
||||||
Invoke-NativeSafe { docker compose up -d }
|
Invoke-NativeSafe { docker compose up -d }
|
||||||
Pop-Location
|
Pop-Location
|
||||||
Write-Ok "All services started."
|
Write-Ok "All containers started; waiting for stack to become healthy..."
|
||||||
|
|
||||||
|
$waitResult = Wait-StackHealthy -TimeoutSec 300
|
||||||
|
if (-not $waitResult.Ok) {
|
||||||
|
Invoke-StackFailureReport -Result $waitResult
|
||||||
|
}
|
||||||
|
Write-Ok "All services healthy."
|
||||||
|
|
||||||
Remove-Item $KeyFile -ErrorAction SilentlyContinue
|
Remove-Item $KeyFile -ErrorAction SilentlyContinue
|
||||||
|
|
||||||
|
|
@ -260,7 +423,13 @@ if ($MigrationMode) {
|
||||||
Push-Location $InstallDir
|
Push-Location $InstallDir
|
||||||
Invoke-NativeSafe { docker compose up -d }
|
Invoke-NativeSafe { docker compose up -d }
|
||||||
Pop-Location
|
Pop-Location
|
||||||
Write-Ok "All services started."
|
Write-Ok "All containers started; waiting for stack to become healthy..."
|
||||||
|
|
||||||
|
$waitResult = Wait-StackHealthy -TimeoutSec 300
|
||||||
|
if (-not $waitResult.Ok) {
|
||||||
|
Invoke-StackFailureReport -Result $waitResult
|
||||||
|
}
|
||||||
|
Write-Ok "All services healthy."
|
||||||
}
|
}
|
||||||
|
|
||||||
# ── Watchtower (auto-update) ────────────────────────────────────────────────
|
# ── Watchtower (auto-update) ────────────────────────────────────────────────
|
||||||
|
|
|
||||||
|
|
@ -97,6 +97,163 @@ wait_for_pg() {
|
||||||
success "PostgreSQL is ready."
|
success "PostgreSQL is ready."
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# ── Stack health helpers ─────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Enumerate compose services for project `surfsense` as `service|state|health|exitcode`
|
||||||
|
# lines. Uses `docker inspect` so we don't depend on `jq`, `python3`, or the
|
||||||
|
# exact ordering of fields in `docker compose ps --format json` output.
|
||||||
|
get_compose_services() {
|
||||||
|
local containers
|
||||||
|
containers=$(docker ps -a --filter "label=com.docker.compose.project=surfsense" --format '{{.Names}}' 2>/dev/null) || true
|
||||||
|
[[ -z "$containers" ]] && return 0
|
||||||
|
|
||||||
|
while IFS= read -r container; do
|
||||||
|
[[ -z "$container" ]] && continue
|
||||||
|
local svc state health code
|
||||||
|
svc=$(docker inspect -f '{{index .Config.Labels "com.docker.compose.service"}}' "$container" 2>/dev/null || echo "")
|
||||||
|
state=$(docker inspect -f '{{.State.Status}}' "$container" 2>/dev/null || echo "unknown")
|
||||||
|
health=$(docker inspect -f '{{if .State.Health}}{{.State.Health.Status}}{{end}}' "$container" 2>/dev/null || echo "")
|
||||||
|
code=$(docker inspect -f '{{.State.ExitCode}}' "$container" 2>/dev/null || echo "")
|
||||||
|
[[ -z "$svc" ]] && continue
|
||||||
|
printf '%s|%s|%s|%s\n' "$svc" "$state" "$health" "$code"
|
||||||
|
done <<< "$containers"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Globals populated by wait_stack_healthy / consumed by stack_failure_report.
|
||||||
|
STACK_BAD=()
|
||||||
|
STACK_WAITING=()
|
||||||
|
STACK_GOOD=()
|
||||||
|
STACK_TIMEOUT=false
|
||||||
|
|
||||||
|
wait_stack_healthy() {
|
||||||
|
local timeout_sec=${1:-300}
|
||||||
|
local deadline=$(($(date +%s) + timeout_sec))
|
||||||
|
local last_report=""
|
||||||
|
local bad=()
|
||||||
|
local waiting=()
|
||||||
|
local good=()
|
||||||
|
|
||||||
|
while [[ $(date +%s) -lt $deadline ]]; do
|
||||||
|
local lines
|
||||||
|
lines=$(get_compose_services)
|
||||||
|
if [[ -z "$lines" ]]; then
|
||||||
|
sleep 3
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
bad=()
|
||||||
|
waiting=()
|
||||||
|
good=()
|
||||||
|
|
||||||
|
while IFS='|' read -r name state health code; do
|
||||||
|
[[ -z "$name" ]] && continue
|
||||||
|
if [[ "$name" == "migrations" ]]; then
|
||||||
|
if [[ "$state" == "exited" && "$code" == "0" ]]; then
|
||||||
|
good+=("$name")
|
||||||
|
elif [[ "$state" == "exited" ]]; then
|
||||||
|
bad+=("${name} (exit=${code})")
|
||||||
|
else
|
||||||
|
waiting+=("${name} (${state})")
|
||||||
|
fi
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ "$state" == "running" ]]; then
|
||||||
|
if [[ -z "$health" || "$health" == "healthy" ]]; then
|
||||||
|
good+=("$name")
|
||||||
|
elif [[ "$health" == "starting" ]]; then
|
||||||
|
waiting+=("${name} (starting)")
|
||||||
|
elif [[ "$health" == "unhealthy" ]]; then
|
||||||
|
bad+=("${name} (unhealthy)")
|
||||||
|
else
|
||||||
|
waiting+=("${name} (${health})")
|
||||||
|
fi
|
||||||
|
elif [[ "$state" == "restarting" ]]; then
|
||||||
|
bad+=("${name} (restarting)")
|
||||||
|
elif [[ "$state" == "exited" ]]; then
|
||||||
|
bad+=("${name} (exited, code=${code})")
|
||||||
|
else
|
||||||
|
waiting+=("${name} (${state})")
|
||||||
|
fi
|
||||||
|
done <<< "$lines"
|
||||||
|
|
||||||
|
if (( ${#bad[@]} > 0 )); then
|
||||||
|
STACK_BAD=("${bad[@]}")
|
||||||
|
STACK_WAITING=("${waiting[@]}")
|
||||||
|
STACK_GOOD=("${good[@]}")
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
if (( ${#waiting[@]} == 0 )); then
|
||||||
|
STACK_GOOD=("${good[@]}")
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
local report="Waiting on: ${waiting[*]}"
|
||||||
|
if [[ "$report" != "$last_report" ]]; then
|
||||||
|
info "$report"
|
||||||
|
last_report="$report"
|
||||||
|
fi
|
||||||
|
sleep 5
|
||||||
|
done
|
||||||
|
|
||||||
|
# bad/waiting/good are declared at function scope so referencing them is
|
||||||
|
# safe even if the polling loop never executed its body.
|
||||||
|
STACK_BAD=()
|
||||||
|
[[ ${#bad[@]} -gt 0 ]] && STACK_BAD=("${bad[@]}")
|
||||||
|
STACK_WAITING=()
|
||||||
|
[[ ${#waiting[@]} -gt 0 ]] && STACK_WAITING=("${waiting[@]}")
|
||||||
|
STACK_GOOD=()
|
||||||
|
[[ ${#good[@]} -gt 0 ]] && STACK_GOOD=("${good[@]}")
|
||||||
|
STACK_TIMEOUT=true
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
stack_failure_report() {
|
||||||
|
echo ""
|
||||||
|
echo -e "\033[31m[ERROR]\033[0m Stack did not reach a healthy state."
|
||||||
|
if (( ${#STACK_BAD[@]} > 0 )) && [[ -n "${STACK_BAD[0]}" ]]; then
|
||||||
|
echo " Failed: ${STACK_BAD[*]}"
|
||||||
|
fi
|
||||||
|
if (( ${#STACK_WAITING[@]} > 0 )) && [[ -n "${STACK_WAITING[0]}" ]]; then
|
||||||
|
echo " Stuck: ${STACK_WAITING[*]}"
|
||||||
|
fi
|
||||||
|
echo ""
|
||||||
|
info "Recent logs from migrations / zero-cache / backend:"
|
||||||
|
(cd "${INSTALL_DIR}" && ${DC} logs --tail=60 migrations zero-cache backend 2>&1) || true
|
||||||
|
echo ""
|
||||||
|
echo "Recovery hints:"
|
||||||
|
echo " 1. Inspect migrations: cd ${INSTALL_DIR} && ${DC} logs migrations"
|
||||||
|
echo " 2. Verify publication: cd ${INSTALL_DIR} && ${DC} exec db psql -U surfsense -d surfsense -c 'SELECT pubname FROM pg_publication;'"
|
||||||
|
echo " 3. Hard reset zero db: cd ${INSTALL_DIR} && ${DC} down && docker volume rm surfsense-zero-cache && ${DC} up -d"
|
||||||
|
echo ""
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
# True if `surfsense-zero-cache` exists but `surfsense-zero-init` does not.
|
||||||
|
# That signals an install that predates the migrations-service fix; the old
|
||||||
|
# replica may be half-initialized and would block zero-cache on next start.
|
||||||
|
test_stale_zero_cache_volume() {
|
||||||
|
local has_zc has_zi
|
||||||
|
has_zc=$(docker volume ls --format '{{.Name}}' 2>/dev/null | grep -Fx 'surfsense-zero-cache' || true)
|
||||||
|
has_zi=$(docker volume ls --format '{{.Name}}' 2>/dev/null | grep -Fx 'surfsense-zero-init' || true)
|
||||||
|
[[ -n "$has_zc" && -z "$has_zi" ]]
|
||||||
|
}
|
||||||
|
|
||||||
|
invoke_stale_zero_cache_cleanup() {
|
||||||
|
if ! test_stale_zero_cache_volume; then
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
warn "Detected pre-existing 'surfsense-zero-cache' volume from an install that"
|
||||||
|
warn "predates the migrations-service fix. It may contain a half-initialized"
|
||||||
|
warn "SQLite replica that would block zero-cache from starting."
|
||||||
|
warn "The volume will be removed in 5 seconds; press Ctrl+C to cancel."
|
||||||
|
sleep 5
|
||||||
|
|
||||||
|
(cd "${INSTALL_DIR}" && ${DC} down --remove-orphans 2>/dev/null) || true
|
||||||
|
docker volume rm surfsense-zero-cache 2>/dev/null || true
|
||||||
|
success "Removed surfsense-zero-cache volume; zero-cache will re-sync on next start."
|
||||||
|
}
|
||||||
|
|
||||||
# ── Download files ───────────────────────────────────────────────────────────
|
# ── Download files ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
step "Downloading SurfSense files"
|
step "Downloading SurfSense files"
|
||||||
|
|
@ -186,6 +343,8 @@ fi
|
||||||
|
|
||||||
# ── Start containers ─────────────────────────────────────────────────────────
|
# ── Start containers ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
invoke_stale_zero_cache_cleanup
|
||||||
|
|
||||||
if $MIGRATION_MODE; then
|
if $MIGRATION_MODE; then
|
||||||
# Read DB credentials from .env (fall back to defaults from docker-compose.yml)
|
# Read DB credentials from .env (fall back to defaults from docker-compose.yml)
|
||||||
DB_USER=$(grep '^DB_USER=' "${INSTALL_DIR}/.env" 2>/dev/null | cut -d= -f2 | tr -d '"' | head -1 || true)
|
DB_USER=$(grep '^DB_USER=' "${INSTALL_DIR}/.env" 2>/dev/null | cut -d= -f2 | tr -d '"' | head -1 || true)
|
||||||
|
|
@ -243,7 +402,12 @@ if $MIGRATION_MODE; then
|
||||||
|
|
||||||
step "Starting all SurfSense services"
|
step "Starting all SurfSense services"
|
||||||
(cd "${INSTALL_DIR}" && ${DC} up -d) < /dev/null
|
(cd "${INSTALL_DIR}" && ${DC} up -d) < /dev/null
|
||||||
success "All services started."
|
success "All containers started; waiting for stack to become healthy..."
|
||||||
|
|
||||||
|
if ! wait_stack_healthy 300; then
|
||||||
|
stack_failure_report
|
||||||
|
fi
|
||||||
|
success "All services healthy."
|
||||||
|
|
||||||
# Key file is no longer needed — SECRET_KEY is now in .env
|
# Key file is no longer needed — SECRET_KEY is now in .env
|
||||||
rm -f "${KEY_FILE}"
|
rm -f "${KEY_FILE}"
|
||||||
|
|
@ -251,7 +415,12 @@ if $MIGRATION_MODE; then
|
||||||
else
|
else
|
||||||
step "Starting SurfSense"
|
step "Starting SurfSense"
|
||||||
(cd "${INSTALL_DIR}" && ${DC} up -d) < /dev/null
|
(cd "${INSTALL_DIR}" && ${DC} up -d) < /dev/null
|
||||||
success "All services started."
|
success "All containers started; waiting for stack to become healthy..."
|
||||||
|
|
||||||
|
if ! wait_stack_healthy 300; then
|
||||||
|
stack_failure_report
|
||||||
|
fi
|
||||||
|
success "All services healthy."
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# ── Watchtower (auto-update) ─────────────────────────────────────────────────
|
# ── Watchtower (auto-update) ─────────────────────────────────────────────────
|
||||||
|
|
|
||||||
|
|
@ -167,10 +167,14 @@ COPY scripts/docker/entrypoint.sh /app/scripts/docker/entrypoint.sh
|
||||||
RUN dos2unix /app/scripts/docker/entrypoint.sh && chmod +x /app/scripts/docker/entrypoint.sh
|
RUN dos2unix /app/scripts/docker/entrypoint.sh && chmod +x /app/scripts/docker/entrypoint.sh
|
||||||
|
|
||||||
# SERVICE_ROLE controls which process this container runs:
|
# SERVICE_ROLE controls which process this container runs:
|
||||||
# api – FastAPI backend only (runs migrations on startup)
|
# migrate – Run alembic upgrade head, verify zero_publication exists, exit 0.
|
||||||
|
# Used by the dedicated `migrations` service in docker-compose.yml
|
||||||
|
# so downstream services gate on `service_completed_successfully`.
|
||||||
|
# api – FastAPI backend only (does NOT run migrations)
|
||||||
# worker – Celery worker only
|
# worker – Celery worker only
|
||||||
# beat – Celery beat scheduler only
|
# beat – Celery beat scheduler only
|
||||||
# all – All three (legacy / dev default)
|
# all – migrations + api + worker + beat (legacy / dev default;
|
||||||
|
# fails fast on migration error)
|
||||||
ENV SERVICE_ROLE=all
|
ENV SERVICE_ROLE=all
|
||||||
|
|
||||||
# Celery worker tuning (only used when SERVICE_ROLE=worker or all)
|
# Celery worker tuning (only used when SERVICE_ROLE=worker or all)
|
||||||
|
|
|
||||||
|
|
@ -945,6 +945,36 @@ async def health_check():
|
||||||
return {"status": "ok"}
|
return {"status": "ok"}
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/ready", tags=["health"])
@limiter.exempt
async def readiness_check():
    """Readiness probe.

    Verifies that the schema state required by downstream services is
    present. Specifically checks that the ``zero_publication`` Postgres
    logical-replication publication exists; without it zero-cache crash-loops
    on `Unknown or invalid publications`.

    Used by the docker-compose backend healthcheck and by ``install.ps1`` /
    ``install.sh`` post-up verification.

    Returns:
        ``{"status": "ready"}`` (HTTP 200) when the publication exists.

    Raises:
        HTTPException: 503 when the publication is missing, or when the
            database cannot be reached / queried at all.
    """
    # Imported lazily, as in the original, so the route adds no import-time
    # dependency on the DB layer.
    from sqlalchemy import text

    from app.db import async_session_maker

    try:
        async with async_session_maker() as session:
            result = await session.execute(
                text("SELECT 1 FROM pg_publication WHERE pubname = 'zero_publication'")
            )
            publication_row = result.first()
    except Exception as exc:
        # A connection or query failure means "not ready", not "server bug":
        # surface it as 503 so the probe honours its documented contract
        # instead of leaking an unhandled 500. The underlying error is chained
        # for server-side logs but deliberately kept out of the response body.
        raise HTTPException(
            status_code=503,
            detail="database unavailable; cannot verify zero_publication",
        ) from exc

    # Raised outside the try block so this 503 is never re-wrapped by the
    # generic handler above.
    if publication_row is None:
        raise HTTPException(
            status_code=503,
            detail="zero_publication missing; run alembic upgrade head",
        )
    return {"status": "ready"}
|
||||||
|
|
||||||
|
|
||||||
@app.get("/verify-token")
|
@app.get("/verify-token")
|
||||||
async def authenticated_route(
|
async def authenticated_route(
|
||||||
user: User = Depends(current_active_user),
|
user: User = Depends(current_active_user),
|
||||||
|
|
|
||||||
|
|
@ -4,10 +4,15 @@ set -e
|
||||||
# ─────────────────────────────────────────────────────────────
|
# ─────────────────────────────────────────────────────────────
|
||||||
# SERVICE_ROLE controls which process(es) this container runs.
|
# SERVICE_ROLE controls which process(es) this container runs.
|
||||||
#
|
#
|
||||||
# api – FastAPI backend only (runs migrations on startup)
|
# migrate – Run `alembic upgrade head`, verify zero_publication,
|
||||||
|
# then exit 0. Used by the dedicated `migrations` service
|
||||||
|
# in docker-compose.yml so downstream services can gate
|
||||||
|
# on `condition: service_completed_successfully`.
|
||||||
|
# api – FastAPI backend only (does NOT run migrations)
|
||||||
# worker – Celery worker only
|
# worker – Celery worker only
|
||||||
# beat – Celery beat scheduler only
|
# beat – Celery beat scheduler only
|
||||||
# all – All three in one container (legacy / dev default)
|
# all – migrations + api + worker + beat in one container
|
||||||
|
# (legacy / dev default; fails fast on migration error)
|
||||||
#
|
#
|
||||||
# Set SERVICE_ROLE as an environment variable in Coolify for
|
# Set SERVICE_ROLE as an environment variable in Coolify for
|
||||||
# each service deployment.
|
# each service deployment.
|
||||||
|
|
@ -41,7 +46,13 @@ cleanup() {
|
||||||
|
|
||||||
trap cleanup SIGTERM SIGINT
|
trap cleanup SIGTERM SIGINT
|
||||||
|
|
||||||
# ── Database migrations (only for api / all) ─────────────────
|
# ── Database migrations (only for migrate / all) ─────────────
|
||||||
|
# Fail-fast contract:
|
||||||
|
# - alembic upgrade head must succeed within ${MIGRATION_TIMEOUT:-900}s
|
||||||
|
# - zero_publication must exist in pg_publication afterwards
|
||||||
|
# Either failure exits non-zero so the dedicated `migrations` compose
|
||||||
|
# service exits non-zero, halting the rest of the stack instead of
|
||||||
|
# silently producing a half-built system that crash-loops zero-cache.
|
||||||
run_migrations() {
|
run_migrations() {
|
||||||
echo "Running database migrations..."
|
echo "Running database migrations..."
|
||||||
for i in {1..30}; do
|
for i in {1..30}; do
|
||||||
|
|
@ -53,11 +64,66 @@ run_migrations() {
|
||||||
sleep 1
|
sleep 1
|
||||||
done
|
done
|
||||||
|
|
||||||
if timeout 300 alembic upgrade head 2>&1; then
|
local timeout_secs="${MIGRATION_TIMEOUT:-900}"
|
||||||
echo "Migrations completed successfully."
|
echo "Running alembic upgrade head (timeout=${timeout_secs}s)..."
|
||||||
else
|
if ! timeout "${timeout_secs}" alembic upgrade head; then
|
||||||
echo "WARNING: Migration failed or timed out. Continuing anyway..."
|
echo "ERROR: alembic upgrade head failed (or exceeded ${timeout_secs}s timeout)." >&2
|
||||||
echo "You may need to run migrations manually: alembic upgrade head"
|
echo "Refusing to start. Inspect the error above and re-run." >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "Migrations completed successfully."
|
||||||
|
|
||||||
|
echo "Verifying zero_publication exists in Postgres..."
|
||||||
|
local pub_oid
|
||||||
|
pub_oid=$(python <<'PY' 2>/dev/null || true
|
||||||
|
import asyncio
|
||||||
|
import sys
|
||||||
|
from sqlalchemy import text
|
||||||
|
from app.db import engine
|
||||||
|
|
||||||
|
|
||||||
|
async def get_oid():
|
||||||
|
async with engine.connect() as conn:
|
||||||
|
result = await conn.execute(
|
||||||
|
text("SELECT oid FROM pg_publication WHERE pubname = 'zero_publication'")
|
||||||
|
)
|
||||||
|
row = result.first()
|
||||||
|
if row is None:
|
||||||
|
sys.exit(1)
|
||||||
|
print(int(row[0]))
|
||||||
|
|
||||||
|
|
||||||
|
asyncio.run(get_oid())
|
||||||
|
PY
|
||||||
|
)
|
||||||
|
if [ -z "${pub_oid}" ]; then
|
||||||
|
echo "ERROR: zero_publication is missing from Postgres after running alembic." >&2
|
||||||
|
echo "This usually means migration 116 (or a later publication migration) did not run." >&2
|
||||||
|
echo "Inspect alembic state with:" >&2
|
||||||
|
echo " docker compose exec db psql -U \"\$DB_USER\" -d \"\$DB_NAME\" -c 'SELECT * FROM alembic_version;'" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "zero_publication verified (oid=${pub_oid})."
|
||||||
|
|
||||||
|
# Stale-replica safety net: if /zero-init is mounted (i.e. we are the
|
||||||
|
# dedicated `migrations` compose service), drop a marker file when the
|
||||||
|
# publication oid changed (or on first run) so the wrapped zero-cache
|
||||||
|
# entrypoint can wipe /data/zero.db before starting. This recovers from
|
||||||
|
# the case where a previous zero-cache crashed mid-init and left a
|
||||||
|
# half-built SQLite replica without a `_zero.tableMetadata` table.
|
||||||
|
if [ -d /zero-init ]; then
|
||||||
|
local stored_oid=""
|
||||||
|
[ -f /zero-init/last_pub_oid ] && stored_oid=$(cat /zero-init/last_pub_oid 2>/dev/null || true)
|
||||||
|
if [ -z "${stored_oid}" ] || [ "${stored_oid}" != "${pub_oid}" ]; then
|
||||||
|
echo "Publication oid changed (stored=${stored_oid:-<none>}, current=${pub_oid}); writing /zero-init/needs_reset."
|
||||||
|
: > /zero-init/needs_reset
|
||||||
|
chmod 666 /zero-init/needs_reset 2>/dev/null || true
|
||||||
|
fi
|
||||||
|
echo "${pub_oid}" > /zero-init/last_pub_oid
|
||||||
|
chmod 666 /zero-init/last_pub_oid 2>/dev/null || true
|
||||||
|
# World-writable dir so the (possibly non-root) zero-cache container
|
||||||
|
# can `rm -f /zero-init/needs_reset` after acting on the marker.
|
||||||
|
chmod 777 /zero-init 2>/dev/null || true
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -102,8 +168,12 @@ start_beat() {
|
||||||
|
|
||||||
# ── Main: run based on role ──────────────────────────────────
|
# ── Main: run based on role ──────────────────────────────────
|
||||||
case "${SERVICE_ROLE}" in
|
case "${SERVICE_ROLE}" in
|
||||||
api)
|
migrate)
|
||||||
run_migrations
|
run_migrations
|
||||||
|
echo "Migrations complete; exiting cleanly."
|
||||||
|
exit 0
|
||||||
|
;;
|
||||||
|
api)
|
||||||
start_api
|
start_api
|
||||||
;;
|
;;
|
||||||
worker)
|
worker)
|
||||||
|
|
@ -121,7 +191,7 @@ case "${SERVICE_ROLE}" in
|
||||||
start_beat
|
start_beat
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
echo "ERROR: Unknown SERVICE_ROLE '${SERVICE_ROLE}'. Use: api, worker, beat, or all"
|
echo "ERROR: Unknown SERVICE_ROLE '${SERVICE_ROLE}'. Use: migrate, api, worker, beat, or all"
|
||||||
exit 1
|
exit 1
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
|
|
|
||||||
|
|
@ -71,7 +71,7 @@ Defaults work out of the box. Change `ZERO_ADMIN_PASSWORD` for security in produ
|
||||||
| `ZERO_UPSTREAM_DB` | PostgreSQL connection URL for replication (must be a direct connection, not via pgbouncer) | *(built from DB_* vars)* |
|
| `ZERO_UPSTREAM_DB` | PostgreSQL connection URL for replication (must be a direct connection, not via pgbouncer) | *(built from DB_* vars)* |
|
||||||
| `ZERO_CVR_DB` | PostgreSQL connection URL for client view records | *(built from DB_* vars)* |
|
| `ZERO_CVR_DB` | PostgreSQL connection URL for client view records | *(built from DB_* vars)* |
|
||||||
| `ZERO_CHANGE_DB` | PostgreSQL connection URL for replication log entries | *(built from DB_* vars)* |
|
| `ZERO_CHANGE_DB` | PostgreSQL connection URL for replication log entries | *(built from DB_* vars)* |
|
||||||
| `ZERO_APP_PUBLICATIONS` | PostgreSQL publication restricting which tables are replicated (created by migration 116) | `zero_publication` |
|
| `ZERO_APP_PUBLICATIONS` | PostgreSQL publication restricting which tables are replicated (created by migration 116, verified by the `migrations` service before `zero-cache` starts) | `zero_publication` |
|
||||||
| `ZERO_NUM_SYNC_WORKERS` | Number of view-sync worker processes. Must be ≤ connection pool sizes | `4` |
|
| `ZERO_NUM_SYNC_WORKERS` | Number of view-sync worker processes. Must be ≤ connection pool sizes | `4` |
|
||||||
| `ZERO_UPSTREAM_MAX_CONNS` | Max connections to upstream PostgreSQL for mutations | `20` |
|
| `ZERO_UPSTREAM_MAX_CONNS` | Max connections to upstream PostgreSQL for mutations | `20` |
|
||||||
| `ZERO_CVR_MAX_CONNS` | Max connections to the CVR database | `30` |
|
| `ZERO_CVR_MAX_CONNS` | Max connections to the CVR database | `30` |
|
||||||
|
|
@ -150,7 +150,9 @@ Uncomment the connectors you want to use. Redirect URIs follow the pattern `http
|
||||||
| Service | Description |
|
| Service | Description |
|
||||||
|---------|-------------|
|
|---------|-------------|
|
||||||
| `db` | PostgreSQL with pgvector extension |
|
| `db` | PostgreSQL with pgvector extension |
|
||||||
|
| `migrations` | Short-lived: runs `alembic upgrade head` and verifies `zero_publication`, then exits |
|
||||||
| `redis` | Message broker for Celery |
|
| `redis` | Message broker for Celery |
|
||||||
|
| `searxng` | Local privacy-respecting search backend |
|
||||||
| `backend` | FastAPI application server |
|
| `backend` | FastAPI application server |
|
||||||
| `celery_worker` | Background task processing (document indexing, etc.) |
|
| `celery_worker` | Background task processing (document indexing, etc.) |
|
||||||
| `celery_beat` | Periodic task scheduler (connector sync) |
|
| `celery_beat` | Periodic task scheduler (connector sync) |
|
||||||
|
|
@ -159,7 +161,42 @@ Uncomment the connectors you want to use. Redirect URIs follow the pattern `http
|
||||||
|
|
||||||
All services start automatically with `docker compose up -d`.
|
All services start automatically with `docker compose up -d`.
|
||||||
|
|
||||||
The backend includes a health check. Dependent services (workers, frontend) wait until the API is fully ready before starting. You can monitor startup progress with `docker compose ps` (look for `(health: starting)` → `(healthy)`).
|
### How startup ordering works
|
||||||
|
|
||||||
|
Schema migrations run as a dedicated `migrations` service that exits 0 on
|
||||||
|
success and non-zero on failure. Every other backend-image service gates on
|
||||||
|
it via `condition: service_completed_successfully`:
|
||||||
|
|
||||||
|
```text
|
||||||
|
db (healthy) ──▶ migrations (alembic upgrade head + verify zero_publication)
|
||||||
|
│
|
||||||
|
├── exit 0 ─▶ backend ──▶ frontend
|
||||||
|
│ celery_worker
|
||||||
|
│ celery_beat
|
||||||
|
│ zero-cache ──▶ frontend
|
||||||
|
│
|
||||||
|
└── exit ≠ 0 ─▶ compose halts the rest of the stack
|
||||||
|
```
|
||||||
|
|
||||||
|
This guarantees `zero-cache` only starts after `zero_publication` exists in
|
||||||
|
Postgres. Before this design, a silent migration failure would leave
|
||||||
|
`zero-cache` crash-looping with `Unknown or invalid publications. Specified:
|
||||||
|
[zero_publication]. Found: []`.
|
||||||
|
|
||||||
|
### Readiness vs liveness
|
||||||
|
|
||||||
|
The backend exposes two endpoints:
|
||||||
|
|
||||||
|
- `GET /health` — lightweight liveness probe (always returns 200 if the
|
||||||
|
process is up).
|
||||||
|
- `GET /ready` — readiness probe that confirms `zero_publication` exists.
|
||||||
|
Returns 503 if not. The compose `backend.healthcheck` uses `/ready` so the
|
||||||
|
container only reports `healthy` once the schema is actually usable by
|
||||||
|
zero-cache.
|
||||||
|
|
||||||
|
You can also monitor startup progress with `docker compose ps` (look for
|
||||||
|
`(health: starting)` → `(healthy)`). The install script polls these states
|
||||||
|
automatically and times out after 5 minutes if the stack does not converge.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|
@ -188,6 +225,90 @@ docker compose down -v
|
||||||
|
|
||||||
- **Ports already in use**: Change the relevant `*_PORT` variable in `.env` and restart.
|
- **Ports already in use**: Change the relevant `*_PORT` variable in `.env` and restart.
|
||||||
- **Permission errors on Linux**: You may need to prefix `docker` commands with `sudo`.
|
- **Permission errors on Linux**: You may need to prefix `docker` commands with `sudo`.
|
||||||
- **Zero-cache not starting**: Check `docker compose logs zero-cache`. Ensure PostgreSQL has `wal_level=logical` (configured automatically by the bundled `postgresql.conf`).
|
|
||||||
- **Real-time updates not working**: Open DevTools → Console and check for WebSocket errors. Verify `NEXT_PUBLIC_ZERO_CACHE_URL` matches the running zero-cache address.
|
- **Real-time updates not working**: Open DevTools → Console and check for WebSocket errors. Verify `NEXT_PUBLIC_ZERO_CACHE_URL` matches the running zero-cache address.
|
||||||
- **Line ending issues on Windows**: Run `git config --global core.autocrlf true` before cloning.
|
- **Line ending issues on Windows**: Run `git config --global core.autocrlf true` before cloning.
|
||||||
|
|
||||||
|
### Migration service exited non-zero
|
||||||
|
|
||||||
|
The `migrations` service exits non-zero in two cases:
|
||||||
|
|
||||||
|
1. `alembic upgrade head` failed (timeout or SQL error).
|
||||||
|
2. `alembic` succeeded but `zero_publication` is still missing from
|
||||||
|
`pg_publication`.
|
||||||
|
|
||||||
|
Inspect the logs and the alembic state:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose logs migrations
|
||||||
|
docker compose exec db psql -U surfsense -d surfsense \
|
||||||
|
-c 'SELECT * FROM alembic_version;'
|
||||||
|
docker compose exec db psql -U surfsense -d surfsense \
|
||||||
|
-c 'SELECT pubname FROM pg_publication;'
|
||||||
|
```
|
||||||
|
|
||||||
|
The default migration timeout is 900 seconds. Slow disks (Windows / WSL2)
|
||||||
|
may need more — set `MIGRATION_TIMEOUT` in `.env` to increase it.
|
||||||
|
|
||||||
|
### Zero-cache stuck on `Unknown or invalid publications`
|
||||||
|
|
||||||
|
Symptom (in `docker compose logs zero-cache`):
|
||||||
|
|
||||||
|
```text
|
||||||
|
Error: Unknown or invalid publications. Specified: [zero_publication]. Found: []
|
||||||
|
```
|
||||||
|
|
||||||
|
This means `zero-cache` started before `zero_publication` was created. With
|
||||||
|
the current compose files this should be impossible — the `migrations`
|
||||||
|
service blocks `zero-cache` from starting. If you see it, your stack
|
||||||
|
predates the fix or you brought up `zero-cache` manually with `docker
|
||||||
|
compose up zero-cache` before the migrations service ran.
|
||||||
|
|
||||||
|
Recovery:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose down
|
||||||
|
docker volume rm surfsense-zero-cache # wipe half-built SQLite replica
|
||||||
|
docker compose up -d # migrations runs first, then zero-cache
|
||||||
|
```
|
||||||
|
|
||||||
|
The install script (`install.ps1` / `install.sh`) detects this case
|
||||||
|
automatically: if it finds a `surfsense-zero-cache` volume from a previous
|
||||||
|
install with no matching `surfsense-zero-init` volume, it removes the stale
|
||||||
|
volume before bringing the stack up.
|
||||||
|
|
||||||
|
### Zero-cache crashes with `_zero.tableMetadata` errors
|
||||||
|
|
||||||
|
This indicates a half-initialized SQLite replica left behind by a previous
|
||||||
|
crash. The `migrations` service writes a marker file on a shared volume
|
||||||
|
(`surfsense-zero-init`) when the publication oid changes; zero-cache wipes
|
||||||
|
its replica and re-syncs on next start. If the marker mechanism somehow did
|
||||||
|
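not trigger, run the three recovery commands from the previous section (`docker compose down`, remove the `surfsense-zero-cache` volume, `docker compose up -d`).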
|
||||||
|
|
||||||
|
### Ensuring `wal_level = logical`
|
||||||
|
|
||||||
|
Logical replication is required by zero-cache. The bundled
|
||||||
|
`docker/postgresql.conf` sets `wal_level = logical` automatically. If you
|
||||||
|
swap in your own config or use a managed Postgres, confirm with:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose exec db psql -U surfsense -d surfsense \
|
||||||
|
-c "SHOW wal_level;"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Using `docker-compose.deps-only.yml`
|
||||||
|
|
||||||
|
`docker-compose.deps-only.yml` runs only the dependencies (Postgres, Redis,
|
||||||
|
SearXNG, zero-cache) on Docker while the backend and frontend run on the
|
||||||
|
host. Because there is no backend container in this stack, there is no
|
||||||
|
`migrations` service either, and you must run alembic on the host **before**
|
||||||
|
bringing `zero-cache` up (Postgres itself must already be running so that alembic can connect):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd surfsense_backend
|
||||||
|
uv run alembic upgrade head
|
||||||
|
cd ../docker
|
||||||
|
docker compose -f docker-compose.deps-only.yml up -d
|
||||||
|
```
|
||||||
|
|
||||||
|
If you skip the alembic step, `zero-cache` will crash-loop with `Unknown or
|
||||||
|
invalid publications. Specified: [zero_publication]`.
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue