feat(docker): add ZERO_AUTO_RESET configuration for improved replication safety

- Introduced the ZERO_AUTO_RESET environment variable, which lets Zero automatically wipe and re-sync its local SQLite replica when replication halts.
- Updated Docker Compose files to include ZERO_AUTO_RESET in service configurations.
- Enhanced documentation to clarify the purpose and usage of the new variable.
This commit is contained in:
Anish Sarkar 2026-06-06 14:21:14 +05:30
parent 19fabaf011
commit 4e00f24a03
12 changed files with 304 additions and 151 deletions

View file

@ -102,6 +102,10 @@ EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
# Only change this if you manage publications manually.
# ZERO_APP_PUBLICATIONS=zero_publication
# Keep Zero's documented halt safety net enabled. If replication halts, Zero
# can wipe and re-sync its local SQLite replica without touching Postgres.
# ZERO_AUTO_RESET=true
# Sync worker tuning. zero-cache defaults ZERO_NUM_SYNC_WORKERS to the number
# of CPU cores, which can exceed the connection pool limits on high-core machines.
# Each sync worker needs at least 1 connection from both the UPSTREAM and CVR

View file

@ -114,6 +114,7 @@ services:
- ZERO_REPLICA_FILE=/data/zero.db
- ZERO_ADMIN_PASSWORD=${ZERO_ADMIN_PASSWORD:-surfsense-zero-admin}
- ZERO_APP_PUBLICATIONS=${ZERO_APP_PUBLICATIONS:-zero_publication}
- ZERO_AUTO_RESET=${ZERO_AUTO_RESET:-true}
- ZERO_NUM_SYNC_WORKERS=${ZERO_NUM_SYNC_WORKERS:-4}
- ZERO_UPSTREAM_MAX_CONNS=${ZERO_UPSTREAM_MAX_CONNS:-20}
- ZERO_CVR_MAX_CONNS=${ZERO_CVR_MAX_CONNS:-30}
@ -122,11 +123,13 @@ services:
volumes:
- zero_cache_data:/data
restart: unless-stopped
stop_grace_period: 300s
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:4848/keepalive"]
interval: 10s
timeout: 5s
retries: 5
start_period: 600s
# OPTIONAL — Azurite emulates Azure Blob Storage for testing the Azure
# original-file backend. The default filesystem backend needs none of this.

View file

@ -46,8 +46,6 @@ services:
- PYTHONPATH=/app
- SERVICE_ROLE=migrate
- MIGRATION_TIMEOUT=${MIGRATION_TIMEOUT:-900}
volumes:
- zero_init:/zero-init
depends_on:
db:
condition: service_healthy
@ -235,6 +233,7 @@ services:
- ZERO_REPLICA_FILE=/data/zero.db
- ZERO_ADMIN_PASSWORD=${ZERO_ADMIN_PASSWORD:-surfsense-zero-admin}
- ZERO_APP_PUBLICATIONS=${ZERO_APP_PUBLICATIONS:-zero_publication}
- ZERO_AUTO_RESET=${ZERO_AUTO_RESET:-true}
- ZERO_NUM_SYNC_WORKERS=${ZERO_NUM_SYNC_WORKERS:-4}
- ZERO_UPSTREAM_MAX_CONNS=${ZERO_UPSTREAM_MAX_CONNS:-20}
- ZERO_CVR_MAX_CONNS=${ZERO_CVR_MAX_CONNS:-30}
@ -242,18 +241,14 @@ services:
- ZERO_MUTATE_URL=${ZERO_MUTATE_URL:-http://frontend:3000/api/zero/mutate}
volumes:
- zero_cache_data:/data
- zero_init:/zero-init
# Wrapper: see docker/docker-compose.yml `zero-cache` for rationale.
entrypoint: ["sh", "-c"]
# Pass the script as a single list element so Compose does not tokenize it.
command:
- 'if [ -f /zero-init/needs_reset ]; then echo "[zero-init] publication change detected; wiping replica file(s) under /data" && rm -f /data/zero.db /data/zero.db-shm /data/zero.db-wal && rm -f /zero-init/needs_reset; fi; exec zero-cache'
restart: unless-stopped
stop_grace_period: 300s
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:4848/keepalive"]
interval: 10s
timeout: 5s
retries: 5
start_period: 600s
frontend:
build:
@ -285,7 +280,5 @@ volumes:
name: surfsense-dev-shared-temp
zero_cache_data:
name: surfsense-dev-zero-cache
zero_init:
name: surfsense-dev-zero-init
whatsapp_sessions:
name: surfsense-dev-whatsapp-sessions

View file

@ -29,10 +29,9 @@ services:
# Short-lived schema runner. Executes `alembic upgrade head` and verifies
# that the `zero_publication` Postgres logical-replication publication
# exists, then exits 0. Downstream services (backend, celery_*, zero-cache)
# gate on this with `condition: service_completed_successfully` so a failed
# migration halts the whole stack instead of silently producing a half-built
# system that crash-loops zero-cache on missing publications.
# matches the canonical shape, then exits 0. Downstream services gate on this
# with `condition: service_completed_successfully` so a failed migration halts
# the whole stack instead of booting zero-cache against a drifted publication.
migrations:
image: ghcr.io/modsetter/surfsense-backend:${SURFSENSE_VERSION:-latest}${SURFSENSE_VARIANT:+-${SURFSENSE_VARIANT}}
env_file:
@ -42,8 +41,6 @@ services:
PYTHONPATH: /app
SERVICE_ROLE: migrate
MIGRATION_TIMEOUT: ${MIGRATION_TIMEOUT:-900}
volumes:
- zero_init:/zero-init
depends_on:
db:
condition: service_healthy
@ -231,6 +228,7 @@ services:
ZERO_REPLICA_FILE: /data/zero.db
ZERO_ADMIN_PASSWORD: ${ZERO_ADMIN_PASSWORD:-surfsense-zero-admin}
ZERO_APP_PUBLICATIONS: ${ZERO_APP_PUBLICATIONS:-zero_publication}
ZERO_AUTO_RESET: ${ZERO_AUTO_RESET:-true}
ZERO_NUM_SYNC_WORKERS: ${ZERO_NUM_SYNC_WORKERS:-4}
ZERO_UPSTREAM_MAX_CONNS: ${ZERO_UPSTREAM_MAX_CONNS:-20}
ZERO_CVR_MAX_CONNS: ${ZERO_CVR_MAX_CONNS:-30}
@ -238,16 +236,8 @@ services:
ZERO_MUTATE_URL: ${ZERO_MUTATE_URL:-http://frontend:3000/api/zero/mutate}
volumes:
- zero_cache_data:/data
- zero_init:/zero-init
# Wrapper: if the migrations service flagged a publication change via
# /zero-init/needs_reset, wipe the SQLite replica before starting so
# zero-cache does a clean initial sync. Recovers from the half-built
# replica state (`_zero.tableMetadata` missing) caused by earlier crashes.
entrypoint: ["sh", "-c"]
# Pass the script as a single list element so Compose does not tokenize it.
command:
- 'if [ -f /zero-init/needs_reset ]; then echo "[zero-init] publication change detected; wiping replica file(s) under /data" && rm -f /data/zero.db /data/zero.db-shm /data/zero.db-wal && rm -f /zero-init/needs_reset; fi; exec zero-cache'
restart: unless-stopped
stop_grace_period: 300s
depends_on:
db:
condition: service_healthy
@ -258,6 +248,7 @@ services:
interval: 10s
timeout: 5s
retries: 5
start_period: 600s
frontend:
image: ghcr.io/modsetter/surfsense-web:${SURFSENSE_VERSION:-latest}
@ -289,7 +280,5 @@ volumes:
name: surfsense-shared-temp
zero_cache_data:
name: surfsense-zero-cache
zero_init:
name: surfsense-zero-init
whatsapp_sessions:
name: surfsense-whatsapp-sessions

View file

@ -153,34 +153,6 @@ function Wait-ForPostgres {
# ── Stack startup helper ────────────────────────────────────────────────────
# Reports whether the Docker volume list contains 'surfsense-zero-cache' but
# not 'surfsense-zero-init'.
# Returns: $true in that case; $false otherwise, including when the volume
# listing comes back null/blank.
function Test-StaleZeroCacheVolume {
# List volume names only; docker's stderr is discarded.
# NOTE(review): Invoke-NativeSafe is defined elsewhere in this script — its
# exact return shape (single string vs. array of lines) should be confirmed.
$raw = Invoke-NativeSafe { docker volume ls --format '{{.Name}}' 2>$null }
if ([string]::IsNullOrWhiteSpace($raw)) { return $false }
# Split on LF or CRLF, trim each entry, and drop empty entries.
$names = $raw -split "`r?`n" | ForEach-Object { $_.Trim() } | Where-Object { $_ }
$hasZeroCache = $names -contains 'surfsense-zero-cache'
$hasZeroInit = $names -contains 'surfsense-zero-init'
# Pre-fix installs created surfsense-zero-cache but never surfsense-zero-init.
# Such a volume may hold a half-initialized SQLite replica from an earlier
# crash-loop. Wiping it forces zero-cache to do a fresh initial sync.
return ($hasZeroCache -and -not $hasZeroInit)
}
# Removes the 'surfsense-zero-cache' Docker volume when
# Test-StaleZeroCacheVolume reports it as stale; otherwise does nothing.
# Warns the user, waits 5 seconds so they can cancel, brings the compose
# stack down from $InstallDir, then removes the volume.
function Invoke-StaleZeroCacheCleanup {
if (-not (Test-StaleZeroCacheVolume)) { return }
Write-Warn "Detected pre-existing 'surfsense-zero-cache' volume from an install that"
Write-Warn "predates the migrations-service fix. It may contain a half-initialized"
Write-Warn "SQLite replica that would block zero-cache from starting."
Write-Warn "The volume will be removed in 5 seconds; press Ctrl+C to cancel."
# Grace period announced in the warning above.
Start-Sleep -Seconds 5
# Run `docker compose down` from the install directory; its output is discarded.
# NOTE(review): Push-Location/Pop-Location are not wrapped in try/finally, so
# the location stack is left pushed if the call in between throws — confirm
# whether Invoke-NativeSafe can throw.
Push-Location $InstallDir
Invoke-NativeSafe { docker compose down --remove-orphans 2>$null } | Out-Null
Pop-Location
Invoke-NativeSafe { docker volume rm surfsense-zero-cache 2>$null } | Out-Null
# NOTE(review): the success message below is printed without checking whether
# the `docker volume rm` above actually succeeded.
Write-Ok "Removed surfsense-zero-cache volume; zero-cache will re-sync on next start."
}
function Invoke-StackFailureReport {
Write-Host ""
Write-Host "[ERROR] Stack did not reach a healthy state." -ForegroundColor Red
@ -443,8 +415,6 @@ if (-not (Test-Path $envPath)) {
# ── Start containers ────────────────────────────────────────────────────────
Invoke-StaleZeroCacheCleanup
if ($MigrationMode) {
$envContent = Get-Content $envPath
$DbUser = ($envContent | Select-String '^DB_USER=' | ForEach-Object { ($_ -split '=',2)[1].Trim('"') }) | Select-Object -First 1

View file

@ -189,31 +189,6 @@ compose_up_wait() {
fi
}
# True if `surfsense-zero-cache` exists but `surfsense-zero-init` does not.
# That signals an install that predates the migrations-service fix; the old
# replica may be half-initialized and would block zero-cache on next start.
#
# Globals:   none
# Arguments: none
# Returns:   0 when only `surfsense-zero-cache` is present, 1 otherwise
#            (including when the volume list cannot be obtained).
test_stale_zero_cache_volume() {
  local volumes
  # Take ONE snapshot of the volume list so both checks see the same state.
  # Best-effort: a failing docker yields an empty list, i.e. "not stale".
  volumes=$(docker volume ls --format '{{.Name}}' 2>/dev/null || true)

  # -F fixed string, -x whole-line match, -q status only.
  grep -Fxq -- 'surfsense-zero-cache' <<<"${volumes}" || return 1
  ! grep -Fxq -- 'surfsense-zero-init' <<<"${volumes}"
}
# Removes a stale `surfsense-zero-cache` volume so zero-cache re-syncs on the
# next start. Does nothing unless test_stale_zero_cache_volume reports true.
#
# Globals:   INSTALL_DIR (read), DC (read)
# Arguments: none
# Outputs:   progress messages via warn / success
# Returns:   always 0 — cleanup is best-effort and must not abort the caller.
invoke_stale_zero_cache_cleanup() {
  if ! test_stale_zero_cache_volume; then
    return 0
  fi

  warn "Detected pre-existing 'surfsense-zero-cache' volume from an install that"
  warn "predates the migrations-service fix. It may contain a half-initialized"
  warn "SQLite replica that would block zero-cache from starting."
  warn "The volume will be removed in 5 seconds; press Ctrl+C to cancel."
  sleep 5

  # Stop the stack first so the volume is no longer in use. Runs in a subshell
  # so the caller's working directory is untouched; failures are tolerated.
  # NOTE(review): ${DC} is left unquoted, presumably because it holds a
  # possibly multi-word compose command — confirm against its definition.
  # shellcheck disable=SC2086
  (cd "${INSTALL_DIR}" && ${DC} down --remove-orphans 2>/dev/null) || true

  # Report the real outcome instead of claiming success unconditionally.
  if docker volume rm surfsense-zero-cache 2>/dev/null; then
    success "Removed surfsense-zero-cache volume; zero-cache will re-sync on next start."
  else
    warn "Could not remove surfsense-zero-cache volume; remove it manually with 'docker volume rm surfsense-zero-cache' if zero-cache fails to start."
  fi
  return 0
}
# ── Variant and .env helpers ─────────────────────────────────────────────────
set_env_value() {
@ -448,8 +423,6 @@ fi
# ── Start containers ─────────────────────────────────────────────────────────
invoke_stale_zero_cache_cleanup
if $MIGRATION_MODE; then
# Read DB credentials from .env (fall back to defaults from docker-compose.yml)
DB_USER=$(grep '^DB_USER=' "${INSTALL_DIR}/.env" 2>/dev/null | cut -d= -f2 | tr -d '"' | head -1 || true)