From 4e00f24a031453d6b4030c94d9c787fa7de2f2e1 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Sat, 6 Jun 2026 14:21:14 +0530 Subject: [PATCH] feat(docker): add ZERO_AUTO_RESET configuration for improved replication safety - Introduced the ZERO_AUTO_RESET environment variable to enable automatic reset of the SQLite replica in case of replication halts. - Updated Docker Compose files to include ZERO_AUTO_RESET in service configurations. - Enhanced documentation to clarify the purpose and usage of the new variable. --- docker/.env.example | 4 + docker/docker-compose.deps-only.yml | 3 + docker/docker-compose.dev.yml | 13 +- docker/docker-compose.yml | 23 +- docker/scripts/install.ps1 | 30 --- docker/scripts/install.sh | 27 --- surfsense_backend/alembic/env.py | 22 +- .../143_force_zero_publication_resync.py | 1 - .../155_reconcile_zero_publication.py | 23 ++ surfsense_backend/app/zero_publication.py | 229 ++++++++++++++++++ .../scripts/docker/entrypoint.sh | 55 +---- .../docker-installation/docker-compose.mdx | 25 +- 12 files changed, 304 insertions(+), 151 deletions(-) create mode 100644 surfsense_backend/alembic/versions/155_reconcile_zero_publication.py create mode 100644 surfsense_backend/app/zero_publication.py diff --git a/docker/.env.example b/docker/.env.example index 5f0f3c018..ba81543d3 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -102,6 +102,10 @@ EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2 # Only change this if you manage publications manually. # ZERO_APP_PUBLICATIONS=zero_publication +# Keep Zero's documented halt safety net enabled. If replication halts, Zero +# can wipe and re-sync its local SQLite replica without touching Postgres. +# ZERO_AUTO_RESET=true + # Sync worker tuning. zero-cache defaults ZERO_NUM_SYNC_WORKERS to the number # of CPU cores, which can exceed the connection pool limits on high-core machines. 
# Each sync worker needs at least 1 connection from both the UPSTREAM and CVR diff --git a/docker/docker-compose.deps-only.yml b/docker/docker-compose.deps-only.yml index 528f6df0b..ad4cc3127 100644 --- a/docker/docker-compose.deps-only.yml +++ b/docker/docker-compose.deps-only.yml @@ -114,6 +114,7 @@ services: - ZERO_REPLICA_FILE=/data/zero.db - ZERO_ADMIN_PASSWORD=${ZERO_ADMIN_PASSWORD:-surfsense-zero-admin} - ZERO_APP_PUBLICATIONS=${ZERO_APP_PUBLICATIONS:-zero_publication} + - ZERO_AUTO_RESET=${ZERO_AUTO_RESET:-true} - ZERO_NUM_SYNC_WORKERS=${ZERO_NUM_SYNC_WORKERS:-4} - ZERO_UPSTREAM_MAX_CONNS=${ZERO_UPSTREAM_MAX_CONNS:-20} - ZERO_CVR_MAX_CONNS=${ZERO_CVR_MAX_CONNS:-30} @@ -122,11 +123,13 @@ services: volumes: - zero_cache_data:/data restart: unless-stopped + stop_grace_period: 300s healthcheck: test: ["CMD", "curl", "-f", "http://localhost:4848/keepalive"] interval: 10s timeout: 5s retries: 5 + start_period: 600s # OPTIONAL — Azurite emulates Azure Blob Storage for testing the Azure # original-file backend. The default filesystem backend needs none of this. 
diff --git a/docker/docker-compose.dev.yml b/docker/docker-compose.dev.yml index 2e19d0791..35effefc0 100644 --- a/docker/docker-compose.dev.yml +++ b/docker/docker-compose.dev.yml @@ -46,8 +46,6 @@ services: - PYTHONPATH=/app - SERVICE_ROLE=migrate - MIGRATION_TIMEOUT=${MIGRATION_TIMEOUT:-900} - volumes: - - zero_init:/zero-init depends_on: db: condition: service_healthy @@ -235,6 +233,7 @@ services: - ZERO_REPLICA_FILE=/data/zero.db - ZERO_ADMIN_PASSWORD=${ZERO_ADMIN_PASSWORD:-surfsense-zero-admin} - ZERO_APP_PUBLICATIONS=${ZERO_APP_PUBLICATIONS:-zero_publication} + - ZERO_AUTO_RESET=${ZERO_AUTO_RESET:-true} - ZERO_NUM_SYNC_WORKERS=${ZERO_NUM_SYNC_WORKERS:-4} - ZERO_UPSTREAM_MAX_CONNS=${ZERO_UPSTREAM_MAX_CONNS:-20} - ZERO_CVR_MAX_CONNS=${ZERO_CVR_MAX_CONNS:-30} @@ -242,18 +241,14 @@ services: - ZERO_MUTATE_URL=${ZERO_MUTATE_URL:-http://frontend:3000/api/zero/mutate} volumes: - zero_cache_data:/data - - zero_init:/zero-init - # Wrapper: see docker/docker-compose.yml `zero-cache` for rationale. - entrypoint: ["sh", "-c"] - # Pass the script as a single list element so Compose does not tokenize it. 
- command: - - 'if [ -f /zero-init/needs_reset ]; then echo "[zero-init] publication change detected; wiping replica file(s) under /data" && rm -f /data/zero.db /data/zero.db-shm /data/zero.db-wal && rm -f /zero-init/needs_reset; fi; exec zero-cache' restart: unless-stopped + stop_grace_period: 300s healthcheck: test: ["CMD", "curl", "-f", "http://localhost:4848/keepalive"] interval: 10s timeout: 5s retries: 5 + start_period: 600s frontend: build: @@ -285,7 +280,5 @@ volumes: name: surfsense-dev-shared-temp zero_cache_data: name: surfsense-dev-zero-cache - zero_init: - name: surfsense-dev-zero-init whatsapp_sessions: name: surfsense-dev-whatsapp-sessions diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 93dc10ebe..9bbf28ffd 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -29,10 +29,9 @@ services: # Short-lived schema runner. Executes `alembic upgrade head` and verifies # that the `zero_publication` Postgres logical-replication publication - # exists, then exits 0. Downstream services (backend, celery_*, zero-cache) - # gate on this with `condition: service_completed_successfully` so a failed - # migration halts the whole stack instead of silently producing a half-built - # system that crash-loops zero-cache on missing publications. + # matches the canonical shape, then exits 0. Downstream services gate on this + # with `condition: service_completed_successfully` so a failed migration halts + # the whole stack instead of booting zero-cache against a drifted publication. 
migrations: image: ghcr.io/modsetter/surfsense-backend:${SURFSENSE_VERSION:-latest}${SURFSENSE_VARIANT:+-${SURFSENSE_VARIANT}} env_file: @@ -42,8 +41,6 @@ services: PYTHONPATH: /app SERVICE_ROLE: migrate MIGRATION_TIMEOUT: ${MIGRATION_TIMEOUT:-900} - volumes: - - zero_init:/zero-init depends_on: db: condition: service_healthy @@ -231,6 +228,7 @@ services: ZERO_REPLICA_FILE: /data/zero.db ZERO_ADMIN_PASSWORD: ${ZERO_ADMIN_PASSWORD:-surfsense-zero-admin} ZERO_APP_PUBLICATIONS: ${ZERO_APP_PUBLICATIONS:-zero_publication} + ZERO_AUTO_RESET: ${ZERO_AUTO_RESET:-true} ZERO_NUM_SYNC_WORKERS: ${ZERO_NUM_SYNC_WORKERS:-4} ZERO_UPSTREAM_MAX_CONNS: ${ZERO_UPSTREAM_MAX_CONNS:-20} ZERO_CVR_MAX_CONNS: ${ZERO_CVR_MAX_CONNS:-30} @@ -238,16 +236,8 @@ services: ZERO_MUTATE_URL: ${ZERO_MUTATE_URL:-http://frontend:3000/api/zero/mutate} volumes: - zero_cache_data:/data - - zero_init:/zero-init - # Wrapper: if the migrations service flagged a publication change via - # /zero-init/needs_reset, wipe the SQLite replica before starting so - # zero-cache does a clean initial sync. Recovers from the half-built - # replica state (`_zero.tableMetadata` missing) caused by earlier crashes. - entrypoint: ["sh", "-c"] - # Pass the script as a single list element so Compose does not tokenize it. 
- command: - - 'if [ -f /zero-init/needs_reset ]; then echo "[zero-init] publication change detected; wiping replica file(s) under /data" && rm -f /data/zero.db /data/zero.db-shm /data/zero.db-wal && rm -f /zero-init/needs_reset; fi; exec zero-cache' restart: unless-stopped + stop_grace_period: 300s depends_on: db: condition: service_healthy @@ -258,6 +248,7 @@ services: interval: 10s timeout: 5s retries: 5 + start_period: 600s frontend: image: ghcr.io/modsetter/surfsense-web:${SURFSENSE_VERSION:-latest} @@ -289,7 +280,5 @@ volumes: name: surfsense-shared-temp zero_cache_data: name: surfsense-zero-cache - zero_init: - name: surfsense-zero-init whatsapp_sessions: name: surfsense-whatsapp-sessions diff --git a/docker/scripts/install.ps1 b/docker/scripts/install.ps1 index 23812b2e8..6e973a520 100644 --- a/docker/scripts/install.ps1 +++ b/docker/scripts/install.ps1 @@ -153,34 +153,6 @@ function Wait-ForPostgres { # ── Stack startup helper ──────────────────────────────────────────────────── -function Test-StaleZeroCacheVolume { - $raw = Invoke-NativeSafe { docker volume ls --format '{{.Name}}' 2>$null } - if ([string]::IsNullOrWhiteSpace($raw)) { return $false } - $names = $raw -split "`r?`n" | ForEach-Object { $_.Trim() } | Where-Object { $_ } - $hasZeroCache = $names -contains 'surfsense-zero-cache' - $hasZeroInit = $names -contains 'surfsense-zero-init' - # Pre-fix installs created surfsense-zero-cache but never surfsense-zero-init. - # Such a volume may hold a half-initialized SQLite replica from an earlier - # crash-loop. Wiping it forces zero-cache to do a fresh initial sync. - return ($hasZeroCache -and -not $hasZeroInit) -} - -function Invoke-StaleZeroCacheCleanup { - if (-not (Test-StaleZeroCacheVolume)) { return } - - Write-Warn "Detected pre-existing 'surfsense-zero-cache' volume from an install that" - Write-Warn "predates the migrations-service fix. It may contain a half-initialized" - Write-Warn "SQLite replica that would block zero-cache from starting." 
- Write-Warn "The volume will be removed in 5 seconds; press Ctrl+C to cancel." - Start-Sleep -Seconds 5 - - Push-Location $InstallDir - Invoke-NativeSafe { docker compose down --remove-orphans 2>$null } | Out-Null - Pop-Location - Invoke-NativeSafe { docker volume rm surfsense-zero-cache 2>$null } | Out-Null - Write-Ok "Removed surfsense-zero-cache volume; zero-cache will re-sync on next start." -} - function Invoke-StackFailureReport { Write-Host "" Write-Host "[ERROR] Stack did not reach a healthy state." -ForegroundColor Red @@ -443,8 +415,6 @@ if (-not (Test-Path $envPath)) { # ── Start containers ──────────────────────────────────────────────────────── -Invoke-StaleZeroCacheCleanup - if ($MigrationMode) { $envContent = Get-Content $envPath $DbUser = ($envContent | Select-String '^DB_USER=' | ForEach-Object { ($_ -split '=',2)[1].Trim('"') }) | Select-Object -First 1 diff --git a/docker/scripts/install.sh b/docker/scripts/install.sh index d21d38d79..4df15fbd0 100644 --- a/docker/scripts/install.sh +++ b/docker/scripts/install.sh @@ -189,31 +189,6 @@ compose_up_wait() { fi } -# True if `surfsense-zero-cache` exists but `surfsense-zero-init` does not. -# That signals an install that predates the migrations-service fix; the old -# replica may be half-initialized and would block zero-cache on next start. -test_stale_zero_cache_volume() { - local has_zc has_zi - has_zc=$(docker volume ls --format '{{.Name}}' 2>/dev/null | grep -Fx 'surfsense-zero-cache' || true) - has_zi=$(docker volume ls --format '{{.Name}}' 2>/dev/null | grep -Fx 'surfsense-zero-init' || true) - [[ -n "$has_zc" && -z "$has_zi" ]] -} - -invoke_stale_zero_cache_cleanup() { - if ! test_stale_zero_cache_volume; then - return 0 - fi - warn "Detected pre-existing 'surfsense-zero-cache' volume from an install that" - warn "predates the migrations-service fix. It may contain a half-initialized" - warn "SQLite replica that would block zero-cache from starting." 
- warn "The volume will be removed in 5 seconds; press Ctrl+C to cancel." - sleep 5 - - (cd "${INSTALL_DIR}" && ${DC} down --remove-orphans 2>/dev/null) || true - docker volume rm surfsense-zero-cache 2>/dev/null || true - success "Removed surfsense-zero-cache volume; zero-cache will re-sync on next start." -} - # ── Variant and .env helpers ───────────────────────────────────────────────── set_env_value() { @@ -448,8 +423,6 @@ fi # ── Start containers ───────────────────────────────────────────────────────── -invoke_stale_zero_cache_cleanup - if $MIGRATION_MODE; then # Read DB credentials from .env (fall back to defaults from docker-compose.yml) DB_USER=$(grep '^DB_USER=' "${INSTALL_DIR}/.env" 2>/dev/null | cut -d= -f2 | tr -d '"' | head -1 || true) diff --git a/surfsense_backend/alembic/env.py b/surfsense_backend/alembic/env.py index 5354211aa..04a6b50ff 100644 --- a/surfsense_backend/alembic/env.py +++ b/surfsense_backend/alembic/env.py @@ -3,6 +3,7 @@ import os import sys from logging.config import fileConfig +import sqlalchemy as sa from sqlalchemy import pool from sqlalchemy.engine import Connection from sqlalchemy.ext.asyncio import async_engine_from_config @@ -36,6 +37,9 @@ if config.config_file_name is not None: # target_metadata = mymodel.Base.metadata target_metadata = Base.metadata +MIGRATION_ADVISORY_LOCK_NAMESPACE = "surfsense" +MIGRATION_ADVISORY_LOCK_NAME = "alembic_migrations" + # other values from the config, defined by the needs of env.py, # can be acquired: # my_important_option = config.get_main_option("my_important_option") @@ -73,8 +77,22 @@ def do_run_migrations(connection: Connection) -> None: transaction_per_migration=True, ) - with context.begin_transaction(): - context.run_migrations() + lock_params = { + "namespace": MIGRATION_ADVISORY_LOCK_NAMESPACE, + "name": MIGRATION_ADVISORY_LOCK_NAME, + } + connection.execute( + sa.text("SELECT pg_advisory_lock(hashtext(:namespace), hashtext(:name))"), + lock_params, + ) + try: + with 
context.begin_transaction(): + context.run_migrations() + finally: + connection.execute( + sa.text("SELECT pg_advisory_unlock(hashtext(:namespace), hashtext(:name))"), + lock_params, + ) async def run_async_migrations() -> None: diff --git a/surfsense_backend/alembic/versions/143_force_zero_publication_resync.py b/surfsense_backend/alembic/versions/143_force_zero_publication_resync.py index 147cbde56..193d51039 100644 --- a/surfsense_backend/alembic/versions/143_force_zero_publication_resync.py +++ b/surfsense_backend/alembic/versions/143_force_zero_publication_resync.py @@ -47,7 +47,6 @@ depends_on: str | Sequence[str] | None = None PUBLICATION_NAME = "zero_publication" -# Must stay in sync with the column lists in migrations 117 / 139 / 140. DOCUMENT_COLS = [ "id", "title", diff --git a/surfsense_backend/alembic/versions/155_reconcile_zero_publication.py b/surfsense_backend/alembic/versions/155_reconcile_zero_publication.py new file mode 100644 index 000000000..1d2e6ed34 --- /dev/null +++ b/surfsense_backend/alembic/versions/155_reconcile_zero_publication.py @@ -0,0 +1,23 @@ +"""reconcile zero_publication from canonical definition + +Revision ID: 155 +Revises: 154 +""" + +from collections.abc import Sequence + +from alembic import op +from app.zero_publication import apply_publication + +revision: str = "155" +down_revision: str | None = "154" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + + +def upgrade() -> None: + apply_publication(op.get_bind()) + + +def downgrade() -> None: + """No-op. Historical publication shapes are immutable.""" diff --git a/surfsense_backend/app/zero_publication.py b/surfsense_backend/app/zero_publication.py new file mode 100644 index 000000000..c3e41ef9b --- /dev/null +++ b/surfsense_backend/app/zero_publication.py @@ -0,0 +1,229 @@ +"""Canonical Zero publication definition for SurfSense. + +This module is the single source of truth for ``zero_publication``. 
Future +publication changes should update ``ZERO_PUBLICATION`` and call +``apply_publication()`` from a migration instead of hand-copying table lists. + +SurfSense runs Zero on Postgres with Zero's event triggers installed, so the +official Zero path is a plain ``ALTER PUBLICATION ... SET TABLE``. If a future +deployment cannot use event triggers, use Zero's documented +``zero_0.update_schemas()`` hook as the fallback instead of COMMENT bookends. +""" + +from __future__ import annotations + +import argparse +import asyncio +import os +import sys +from collections.abc import Mapping, Sequence + +from sqlalchemy import text +from sqlalchemy.engine import Connection +from sqlalchemy.ext.asyncio import create_async_engine + +PUBLICATION_NAME = "zero_publication" + +DOCUMENT_COLS = [ + "id", + "title", + "document_type", + "search_space_id", + "folder_id", + "created_by_id", + "status", + "created_at", + "updated_at", +] + +USER_COLS = [ + "id", + "pages_limit", + "pages_used", + "premium_credit_micros_limit", + "premium_credit_micros_used", +] + +AUTOMATION_RUN_COLS = [ + "id", + "automation_id", + "trigger_id", + "status", + "step_results", + "started_at", + "finished_at", + "created_at", +] + +ZERO_PUBLICATION: Mapping[str, Sequence[str] | None] = { + "notifications": None, + "documents": DOCUMENT_COLS, + "folders": None, + "search_source_connectors": None, + "new_chat_messages": None, + "chat_comments": None, + "chat_session_state": None, + "user": USER_COLS, + "automation_runs": AUTOMATION_RUN_COLS, +} + + +def _quote_identifier(identifier: str) -> str: + return '"' + identifier.replace('"', '""') + '"' + + +def _column_exists(conn: Connection, table: str, column: str) -> bool: + return ( + conn.execute( + text( + "SELECT 1 FROM information_schema.columns " + "WHERE table_schema = current_schema() " + "AND table_name = :table AND column_name = :column" + ), + {"table": table, "column": column}, + ).fetchone() + is not None + ) + + +def _expected_columns(conn: 
Connection, table: str) -> list[str] | None: + columns = ZERO_PUBLICATION[table] + if columns is None: + return None + + expected = list(columns) + if table in {"documents", "user"} and _column_exists(conn, table, "_0_version"): + expected.append("_0_version") + return expected + + +def _format_table_entry(conn: Connection, table: str) -> str: + columns = _expected_columns(conn, table) + table_sql = _quote_identifier(table) + if columns is None: + return table_sql + + column_sql = ", ".join(_quote_identifier(column) for column in columns) + return f"{table_sql} ({column_sql})" + + +def build_set_table_sql(conn: Connection) -> str: + """Build the canonical plain SET TABLE statement for Zero's event triggers.""" + + table_list = ", ".join(_format_table_entry(conn, table) for table in ZERO_PUBLICATION) + return f"ALTER PUBLICATION {_quote_identifier(PUBLICATION_NAME)} SET TABLE {table_list}" + + +def apply_publication(conn: Connection) -> None: + """Reconcile ``zero_publication`` to the canonical shape.""" + + exists = conn.execute( + text("SELECT 1 FROM pg_publication WHERE pubname = :name"), + {"name": PUBLICATION_NAME}, + ).fetchone() + if not exists: + return + + conn.execute(text(build_set_table_sql(conn))) + + +def _actual_publication_shape(conn: Connection) -> dict[str, list[str] | None]: + rows = conn.execute( + text( + "SELECT pt.tablename, pr.prattrs IS NULL AS all_columns, pt.attnames " + "FROM pg_publication_tables pt " + "JOIN pg_publication p ON p.pubname = pt.pubname " + "JOIN pg_class c ON c.relname = pt.tablename " + "JOIN pg_namespace n ON n.oid = c.relnamespace AND n.nspname = pt.schemaname " + "JOIN pg_publication_rel pr ON pr.prpubid = p.oid AND pr.prrelid = c.oid " + "WHERE pt.pubname = :name AND pt.schemaname = current_schema() " + "ORDER BY pt.tablename" + ), + {"name": PUBLICATION_NAME}, + ).mappings() + + return { + str(row["tablename"]): None + if row["all_columns"] + else list(row["attnames"] or []) + for row in rows + } + + +def 
expected_publication_shape(conn: Connection) -> dict[str, list[str] | None]: + return {table: _expected_columns(conn, table) for table in ZERO_PUBLICATION} + + +def verify_publication(conn: Connection) -> list[str]: + """Return human-readable mismatches between Postgres and the canonical shape.""" + + publication_exists = conn.execute( + text("SELECT 1 FROM pg_publication WHERE pubname = :name"), + {"name": PUBLICATION_NAME}, + ).fetchone() + if not publication_exists: + return [f"Publication {PUBLICATION_NAME!r} does not exist"] + + actual = _actual_publication_shape(conn) + expected = expected_publication_shape(conn) + mismatches: list[str] = [] + + for table, expected_columns in expected.items(): + if table not in actual: + mismatches.append(f"{table}: missing from publication") + continue + + actual_columns = actual[table] + actual_key = sorted(actual_columns) if actual_columns is not None else None + expected_key = sorted(expected_columns) if expected_columns is not None else None + if actual_key != expected_key: + mismatches.append( + f"{table}: expected columns {expected_columns or 'ALL'}, " + f"got {actual_columns or 'ALL'}" + ) + + for table in sorted(set(actual) - set(expected)): + mismatches.append(f"{table}: unexpected table in publication") + + return mismatches + + +async def _verify_cli() -> int: + database_url = os.getenv("DATABASE_URL") + if not database_url: + print("DATABASE_URL is required to verify zero_publication.", file=sys.stderr) + return 2 + + engine = create_async_engine(database_url) + async with engine.connect() as async_conn: + def run_verify(sync_conn: Connection) -> list[str]: + return verify_publication(sync_conn) + + mismatches = await async_conn.run_sync(run_verify) + + await engine.dispose() + + if mismatches: + print("zero_publication shape mismatch:", file=sys.stderr) + for mismatch in mismatches: + print(f" - {mismatch}", file=sys.stderr) + return 1 + + print("zero_publication shape verified.") + return 0 + + +def main() -> 
int: + parser = argparse.ArgumentParser(description="Manage SurfSense's Zero publication") + parser.add_argument("--verify", action="store_true", help="verify zero_publication shape") + args = parser.parse_args() + + if args.verify: + return asyncio.run(_verify_cli()) + + parser.print_help() + return 2 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/surfsense_backend/scripts/docker/entrypoint.sh b/surfsense_backend/scripts/docker/entrypoint.sh index 0c1e66790..2efd78f94 100644 --- a/surfsense_backend/scripts/docker/entrypoint.sh +++ b/surfsense_backend/scripts/docker/entrypoint.sh @@ -49,10 +49,10 @@ trap cleanup SIGTERM SIGINT # ── Database migrations (only for migrate / all) ───────────── # Fail-fast contract: # - alembic upgrade head must succeed within ${MIGRATION_TIMEOUT:-900}s -# - zero_publication must exist in pg_publication afterwards +# - zero_publication must match the canonical app.zero_publication shape # Either failure exits non-zero so the dedicated `migrations` compose # service exits non-zero, halting the rest of the stack instead of -# silently producing a half-built system that crash-loops zero-cache. +# silently producing a drifted Zero publication. run_migrations() { echo "Running database migrations..." for i in {1..30}; do @@ -73,58 +73,13 @@ run_migrations() { fi echo "Migrations completed successfully." - echo "Verifying zero_publication exists in Postgres..." - local pub_oid - pub_oid=$(python <<'PY' 2>/dev/null || true -import asyncio -import sys -from sqlalchemy import text -from app.db import engine - - -async def get_oid(): - async with engine.connect() as conn: - result = await conn.execute( - text("SELECT oid FROM pg_publication WHERE pubname = 'zero_publication'") - ) - row = result.first() - if row is None: - sys.exit(1) - print(int(row[0])) - - -asyncio.run(get_oid()) -PY -) - if [ -z "${pub_oid}" ]; then - echo "ERROR: zero_publication is missing from Postgres after running alembic." 
>&2 - echo "This usually means migration 116 (or a later publication migration) did not run." >&2 + echo "Verifying zero_publication matches the canonical shape..." + if ! python -m app.zero_publication --verify; then + echo "ERROR: zero_publication does not match the canonical shape." >&2 echo "Inspect alembic state with:" >&2 echo " docker compose exec db psql -U \"\$DB_USER\" -d \"\$DB_NAME\" -c 'SELECT * FROM alembic_version;'" >&2 exit 1 fi - echo "zero_publication verified (oid=${pub_oid})." - - # Stale-replica safety net: if /zero-init is mounted (i.e. we are the - # dedicated `migrations` compose service), drop a marker file when the - # publication oid changed (or on first run) so the wrapped zero-cache - # entrypoint can wipe /data/zero.db before starting. This recovers from - # the case where a previous zero-cache crashed mid-init and left a - # half-built SQLite replica without a `_zero.tableMetadata` table. - if [ -d /zero-init ]; then - local stored_oid="" - [ -f /zero-init/last_pub_oid ] && stored_oid=$(cat /zero-init/last_pub_oid 2>/dev/null || true) - if [ -z "${stored_oid}" ] || [ "${stored_oid}" != "${pub_oid}" ]; then - echo "Publication oid changed (stored=${stored_oid:-}, current=${pub_oid}); writing /zero-init/needs_reset." - : > /zero-init/needs_reset - chmod 666 /zero-init/needs_reset 2>/dev/null || true - fi - echo "${pub_oid}" > /zero-init/last_pub_oid - chmod 666 /zero-init/last_pub_oid 2>/dev/null || true - # World-writable dir so the (possibly non-root) zero-cache container - # can `rm -f /zero-init/needs_reset` after acting on the marker. 
- chmod 777 /zero-init 2>/dev/null || true - fi } # ── Service starters ───────────────────────────────────────── diff --git a/surfsense_web/content/docs/docker-installation/docker-compose.mdx b/surfsense_web/content/docs/docker-installation/docker-compose.mdx index 560b64464..8f71ec77e 100644 --- a/surfsense_web/content/docs/docker-installation/docker-compose.mdx +++ b/surfsense_web/content/docs/docker-installation/docker-compose.mdx @@ -327,11 +327,13 @@ Symptom (in `docker compose logs zero-cache`): Error: Unknown or invalid publications. Specified: [zero_publication]. Found: [] ``` -This means `zero-cache` started before `zero_publication` was created. With -the current compose files this should be impossible. The `migrations` -service blocks `zero-cache` from starting. If you see it, your stack -predates the fix or you brought up `zero-cache` manually with `docker -compose up zero-cache` before the migrations service ran. +This means `zero-cache` started before `zero_publication` was created or the +publication does not match SurfSense's canonical Zero shape. With the current +compose files this should be impossible: the `migrations` service blocks +`zero-cache` from starting and verifies the publication before exiting +successfully. If you see it, your stack predates the fix or you brought up +`zero-cache` manually with `docker compose up zero-cache` before the migrations +service ran. Recovery: @@ -341,18 +343,13 @@ docker volume rm surfsense-zero-cache # wipe half-built SQLite replica docker compose up -d # migrations runs first, then zero-cache ``` -The install script (`install.ps1` / `install.sh`) detects this case -automatically: if it finds a `surfsense-zero-cache` volume from a previous -install with no matching `surfsense-zero-init` volume, it removes the stale -volume before bringing the stack up. - ### Zero-cache crashes with `_zero.tableMetadata` errors This indicates a half-initialized SQLite replica left behind by a previous -crash. 
The `migrations` service writes a marker file on a shared volume -(`surfsense-zero-init`) when the publication oid changes; zero-cache wipes -its replica and re-syncs on next start. If the marker mechanism somehow did -not trigger, run the recovery one-liner above. +crash. Zero's event triggers pick up schema changes, and `ZERO_AUTO_RESET` +recovers from replication halts automatically. If the local SQLite replica is +still wedged, run the recovery one-liner above to wipe `surfsense-zero-cache`; +zero-cache will re-sync from Postgres on the next start. ### Ensuring `wal_level = logical`