mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-08 20:25:19 +02:00
feat(docker): add ZERO_AUTO_RESET configuration for improved replication safety
- Introduced the ZERO_AUTO_RESET environment variable to enable automatic reset of the SQLite replica if replication halts.
- Updated Docker Compose files to include ZERO_AUTO_RESET in service configurations.
- Enhanced documentation to clarify the purpose and usage of the new variable.
This commit is contained in:
parent
19fabaf011
commit
4e00f24a03
12 changed files with 304 additions and 151 deletions
|
|
@ -102,6 +102,10 @@ EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
|
|||
# Only change this if you manage publications manually.
|
||||
# ZERO_APP_PUBLICATIONS=zero_publication
|
||||
|
||||
# Keep Zero's documented halt safety net enabled. If replication halts, Zero
|
||||
# can wipe and re-sync its local SQLite replica without touching Postgres.
|
||||
# ZERO_AUTO_RESET=true
|
||||
|
||||
# Sync worker tuning. zero-cache defaults ZERO_NUM_SYNC_WORKERS to the number
|
||||
# of CPU cores, which can exceed the connection pool limits on high-core machines.
|
||||
# Each sync worker needs at least 1 connection from both the UPSTREAM and CVR
|
||||
|
|
|
|||
|
|
@ -114,6 +114,7 @@ services:
|
|||
- ZERO_REPLICA_FILE=/data/zero.db
|
||||
- ZERO_ADMIN_PASSWORD=${ZERO_ADMIN_PASSWORD:-surfsense-zero-admin}
|
||||
- ZERO_APP_PUBLICATIONS=${ZERO_APP_PUBLICATIONS:-zero_publication}
|
||||
- ZERO_AUTO_RESET=${ZERO_AUTO_RESET:-true}
|
||||
- ZERO_NUM_SYNC_WORKERS=${ZERO_NUM_SYNC_WORKERS:-4}
|
||||
- ZERO_UPSTREAM_MAX_CONNS=${ZERO_UPSTREAM_MAX_CONNS:-20}
|
||||
- ZERO_CVR_MAX_CONNS=${ZERO_CVR_MAX_CONNS:-30}
|
||||
|
|
@ -122,11 +123,13 @@ services:
|
|||
volumes:
|
||||
- zero_cache_data:/data
|
||||
restart: unless-stopped
|
||||
stop_grace_period: 300s
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:4848/keepalive"]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
start_period: 600s
|
||||
|
||||
# OPTIONAL — Azurite emulates Azure Blob Storage for testing the Azure
|
||||
# original-file backend. The default filesystem backend needs none of this.
|
||||
|
|
|
|||
|
|
@ -46,8 +46,6 @@ services:
|
|||
- PYTHONPATH=/app
|
||||
- SERVICE_ROLE=migrate
|
||||
- MIGRATION_TIMEOUT=${MIGRATION_TIMEOUT:-900}
|
||||
volumes:
|
||||
- zero_init:/zero-init
|
||||
depends_on:
|
||||
db:
|
||||
condition: service_healthy
|
||||
|
|
@ -235,6 +233,7 @@ services:
|
|||
- ZERO_REPLICA_FILE=/data/zero.db
|
||||
- ZERO_ADMIN_PASSWORD=${ZERO_ADMIN_PASSWORD:-surfsense-zero-admin}
|
||||
- ZERO_APP_PUBLICATIONS=${ZERO_APP_PUBLICATIONS:-zero_publication}
|
||||
- ZERO_AUTO_RESET=${ZERO_AUTO_RESET:-true}
|
||||
- ZERO_NUM_SYNC_WORKERS=${ZERO_NUM_SYNC_WORKERS:-4}
|
||||
- ZERO_UPSTREAM_MAX_CONNS=${ZERO_UPSTREAM_MAX_CONNS:-20}
|
||||
- ZERO_CVR_MAX_CONNS=${ZERO_CVR_MAX_CONNS:-30}
|
||||
|
|
@ -242,18 +241,14 @@ services:
|
|||
- ZERO_MUTATE_URL=${ZERO_MUTATE_URL:-http://frontend:3000/api/zero/mutate}
|
||||
volumes:
|
||||
- zero_cache_data:/data
|
||||
- zero_init:/zero-init
|
||||
# Wrapper: see docker/docker-compose.yml `zero-cache` for rationale.
|
||||
entrypoint: ["sh", "-c"]
|
||||
# Pass the script as a single list element so Compose does not tokenize it.
|
||||
command:
|
||||
- 'if [ -f /zero-init/needs_reset ]; then echo "[zero-init] publication change detected; wiping replica file(s) under /data" && rm -f /data/zero.db /data/zero.db-shm /data/zero.db-wal && rm -f /zero-init/needs_reset; fi; exec zero-cache'
|
||||
restart: unless-stopped
|
||||
stop_grace_period: 300s
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:4848/keepalive"]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
start_period: 600s
|
||||
|
||||
frontend:
|
||||
build:
|
||||
|
|
@ -285,7 +280,5 @@ volumes:
|
|||
name: surfsense-dev-shared-temp
|
||||
zero_cache_data:
|
||||
name: surfsense-dev-zero-cache
|
||||
zero_init:
|
||||
name: surfsense-dev-zero-init
|
||||
whatsapp_sessions:
|
||||
name: surfsense-dev-whatsapp-sessions
|
||||
|
|
|
|||
|
|
@ -29,10 +29,9 @@ services:
|
|||
|
||||
# Short-lived schema runner. Executes `alembic upgrade head` and verifies
|
||||
# that the `zero_publication` Postgres logical-replication publication
|
||||
# exists, then exits 0. Downstream services (backend, celery_*, zero-cache)
|
||||
# gate on this with `condition: service_completed_successfully` so a failed
|
||||
# migration halts the whole stack instead of silently producing a half-built
|
||||
# system that crash-loops zero-cache on missing publications.
|
||||
# matches the canonical shape, then exits 0. Downstream services gate on this
|
||||
# with `condition: service_completed_successfully` so a failed migration halts
|
||||
# the whole stack instead of booting zero-cache against a drifted publication.
|
||||
migrations:
|
||||
image: ghcr.io/modsetter/surfsense-backend:${SURFSENSE_VERSION:-latest}${SURFSENSE_VARIANT:+-${SURFSENSE_VARIANT}}
|
||||
env_file:
|
||||
|
|
@ -42,8 +41,6 @@ services:
|
|||
PYTHONPATH: /app
|
||||
SERVICE_ROLE: migrate
|
||||
MIGRATION_TIMEOUT: ${MIGRATION_TIMEOUT:-900}
|
||||
volumes:
|
||||
- zero_init:/zero-init
|
||||
depends_on:
|
||||
db:
|
||||
condition: service_healthy
|
||||
|
|
@ -231,6 +228,7 @@ services:
|
|||
ZERO_REPLICA_FILE: /data/zero.db
|
||||
ZERO_ADMIN_PASSWORD: ${ZERO_ADMIN_PASSWORD:-surfsense-zero-admin}
|
||||
ZERO_APP_PUBLICATIONS: ${ZERO_APP_PUBLICATIONS:-zero_publication}
|
||||
ZERO_AUTO_RESET: ${ZERO_AUTO_RESET:-true}
|
||||
ZERO_NUM_SYNC_WORKERS: ${ZERO_NUM_SYNC_WORKERS:-4}
|
||||
ZERO_UPSTREAM_MAX_CONNS: ${ZERO_UPSTREAM_MAX_CONNS:-20}
|
||||
ZERO_CVR_MAX_CONNS: ${ZERO_CVR_MAX_CONNS:-30}
|
||||
|
|
@ -238,16 +236,8 @@ services:
|
|||
ZERO_MUTATE_URL: ${ZERO_MUTATE_URL:-http://frontend:3000/api/zero/mutate}
|
||||
volumes:
|
||||
- zero_cache_data:/data
|
||||
- zero_init:/zero-init
|
||||
# Wrapper: if the migrations service flagged a publication change via
|
||||
# /zero-init/needs_reset, wipe the SQLite replica before starting so
|
||||
# zero-cache does a clean initial sync. Recovers from the half-built
|
||||
# replica state (`_zero.tableMetadata` missing) caused by earlier crashes.
|
||||
entrypoint: ["sh", "-c"]
|
||||
# Pass the script as a single list element so Compose does not tokenize it.
|
||||
command:
|
||||
- 'if [ -f /zero-init/needs_reset ]; then echo "[zero-init] publication change detected; wiping replica file(s) under /data" && rm -f /data/zero.db /data/zero.db-shm /data/zero.db-wal && rm -f /zero-init/needs_reset; fi; exec zero-cache'
|
||||
restart: unless-stopped
|
||||
stop_grace_period: 300s
|
||||
depends_on:
|
||||
db:
|
||||
condition: service_healthy
|
||||
|
|
@ -258,6 +248,7 @@ services:
|
|||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
start_period: 600s
|
||||
|
||||
frontend:
|
||||
image: ghcr.io/modsetter/surfsense-web:${SURFSENSE_VERSION:-latest}
|
||||
|
|
@ -289,7 +280,5 @@ volumes:
|
|||
name: surfsense-shared-temp
|
||||
zero_cache_data:
|
||||
name: surfsense-zero-cache
|
||||
zero_init:
|
||||
name: surfsense-zero-init
|
||||
whatsapp_sessions:
|
||||
name: surfsense-whatsapp-sessions
|
||||
|
|
|
|||
|
|
@ -153,34 +153,6 @@ function Wait-ForPostgres {
|
|||
|
||||
# ── Stack startup helper ────────────────────────────────────────────────────
|
||||
|
||||
function Test-StaleZeroCacheVolume {
    # Returns $true only when the 'surfsense-zero-cache' volume is present and
    # the 'surfsense-zero-init' volume is absent. Pre-fix installs created the
    # former but never the latter, so that combination marks a volume which may
    # hold a half-initialized SQLite replica from an earlier crash-loop.
    $volumeListing = Invoke-NativeSafe { docker volume ls --format '{{.Name}}' 2>$null }
    if ([string]::IsNullOrWhiteSpace($volumeListing)) { return $false }

    # One volume name per line; drop surrounding whitespace and empty entries.
    $volumeNames = $volumeListing -split "`r?`n" | ForEach-Object { $_.Trim() } | Where-Object { $_ }

    if ($volumeNames -notcontains 'surfsense-zero-cache') { return $false }
    return ($volumeNames -notcontains 'surfsense-zero-init')
}
|
||||
|
||||
function Invoke-StaleZeroCacheCleanup {
    # Removes the 'surfsense-zero-cache' volume when Test-StaleZeroCacheVolume
    # reports it as stale. Warns first and waits 5 seconds so the user can
    # cancel with Ctrl+C, then stops the stack and deletes the volume.
    if (-not (Test-StaleZeroCacheVolume)) { return }

    Write-Warn "Detected pre-existing 'surfsense-zero-cache' volume from an install that"
    Write-Warn "predates the migrations-service fix. It may contain a half-initialized"
    Write-Warn "SQLite replica that would block zero-cache from starting."
    Write-Warn "The volume will be removed in 5 seconds; press Ctrl+C to cancel."
    Start-Sleep -Seconds 5

    Push-Location $InstallDir
    try {
        Invoke-NativeSafe { docker compose down --remove-orphans 2>$null } | Out-Null
    }
    finally {
        # Always restore the caller's working directory, even if the compose
        # call raises a terminating error.
        Pop-Location
    }

    Invoke-NativeSafe { docker volume rm surfsense-zero-cache 2>$null } | Out-Null
    Write-Ok "Removed surfsense-zero-cache volume; zero-cache will re-sync on next start."
}
|
||||
|
||||
function Invoke-StackFailureReport {
|
||||
Write-Host ""
|
||||
Write-Host "[ERROR] Stack did not reach a healthy state." -ForegroundColor Red
|
||||
|
|
@ -443,8 +415,6 @@ if (-not (Test-Path $envPath)) {
|
|||
|
||||
# ── Start containers ────────────────────────────────────────────────────────
|
||||
|
||||
Invoke-StaleZeroCacheCleanup
|
||||
|
||||
if ($MigrationMode) {
|
||||
$envContent = Get-Content $envPath
|
||||
$DbUser = ($envContent | Select-String '^DB_USER=' | ForEach-Object { ($_ -split '=',2)[1].Trim('"') }) | Select-Object -First 1
|
||||
|
|
|
|||
|
|
@ -189,31 +189,6 @@ compose_up_wait() {
|
|||
fi
|
||||
}
|
||||
|
||||
# True if `surfsense-zero-cache` exists but `surfsense-zero-init` does not.
|
||||
# That signals an install that predates the migrations-service fix; the old
|
||||
# replica may be half-initialized and would block zero-cache on next start.
|
||||
test_stale_zero_cache_volume() {
  # Succeeds (exit 0) only when the zero-cache volume is listed by Docker and
  # the zero-init volume is not. Lookup failures are treated as "not present".
  local cache_volume init_volume
  cache_volume=$(docker volume ls --format '{{.Name}}' 2>/dev/null | grep -Fx 'surfsense-zero-cache' || true)
  init_volume=$(docker volume ls --format '{{.Name}}' 2>/dev/null | grep -Fx 'surfsense-zero-init' || true)

  if [[ -z "$cache_volume" ]]; then
    return 1
  fi
  [[ -z "$init_volume" ]]
}
|
||||
|
||||
invoke_stale_zero_cache_cleanup() {
  # Nothing to do unless a stale zero-cache volume is detected.
  test_stale_zero_cache_volume || return 0

  warn "Detected pre-existing 'surfsense-zero-cache' volume from an install that"
  warn "predates the migrations-service fix. It may contain a half-initialized"
  warn "SQLite replica that would block zero-cache from starting."
  warn "The volume will be removed in 5 seconds; press Ctrl+C to cancel."
  # Grace period so the user can abort before anything is deleted.
  sleep 5

  # Best effort: stop the stack from the install directory, then drop the
  # volume. Failures of either command are deliberately ignored.
  (cd "${INSTALL_DIR}" && ${DC} down --remove-orphans 2>/dev/null) || true
  docker volume rm surfsense-zero-cache 2>/dev/null || true
  success "Removed surfsense-zero-cache volume; zero-cache will re-sync on next start."
}
|
||||
|
||||
# ── Variant and .env helpers ─────────────────────────────────────────────────
|
||||
|
||||
set_env_value() {
|
||||
|
|
@ -448,8 +423,6 @@ fi
|
|||
|
||||
# ── Start containers ─────────────────────────────────────────────────────────
|
||||
|
||||
invoke_stale_zero_cache_cleanup
|
||||
|
||||
if $MIGRATION_MODE; then
|
||||
# Read DB credentials from .env (fall back to defaults from docker-compose.yml)
|
||||
DB_USER=$(grep '^DB_USER=' "${INSTALL_DIR}/.env" 2>/dev/null | cut -d= -f2 | tr -d '"' | head -1 || true)
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ import os
|
|||
import sys
|
||||
from logging.config import fileConfig
|
||||
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy import pool
|
||||
from sqlalchemy.engine import Connection
|
||||
from sqlalchemy.ext.asyncio import async_engine_from_config
|
||||
|
|
@ -36,6 +37,9 @@ if config.config_file_name is not None:
|
|||
# target_metadata = mymodel.Base.metadata
|
||||
target_metadata = Base.metadata
|
||||
|
||||
MIGRATION_ADVISORY_LOCK_NAMESPACE = "surfsense"
|
||||
MIGRATION_ADVISORY_LOCK_NAME = "alembic_migrations"
|
||||
|
||||
# other values from the config, defined by the needs of env.py,
|
||||
# can be acquired:
|
||||
# my_important_option = config.get_main_option("my_important_option")
|
||||
|
|
@ -73,8 +77,22 @@ def do_run_migrations(connection: Connection) -> None:
|
|||
transaction_per_migration=True,
|
||||
)
|
||||
|
||||
with context.begin_transaction():
|
||||
context.run_migrations()
|
||||
lock_params = {
|
||||
"namespace": MIGRATION_ADVISORY_LOCK_NAMESPACE,
|
||||
"name": MIGRATION_ADVISORY_LOCK_NAME,
|
||||
}
|
||||
connection.execute(
|
||||
sa.text("SELECT pg_advisory_lock(hashtext(:namespace), hashtext(:name))"),
|
||||
lock_params,
|
||||
)
|
||||
try:
|
||||
with context.begin_transaction():
|
||||
context.run_migrations()
|
||||
finally:
|
||||
connection.execute(
|
||||
sa.text("SELECT pg_advisory_unlock(hashtext(:namespace), hashtext(:name))"),
|
||||
lock_params,
|
||||
)
|
||||
|
||||
|
||||
async def run_async_migrations() -> None:
|
||||
|
|
|
|||
|
|
@ -47,7 +47,6 @@ depends_on: str | Sequence[str] | None = None
|
|||
|
||||
PUBLICATION_NAME = "zero_publication"
|
||||
|
||||
# Must stay in sync with the column lists in migrations 117 / 139 / 140.
|
||||
DOCUMENT_COLS = [
|
||||
"id",
|
||||
"title",
|
||||
|
|
|
|||
|
|
@ -0,0 +1,23 @@
|
|||
"""reconcile zero_publication from canonical definition
|
||||
|
||||
Revision ID: 155
|
||||
Revises: 154
|
||||
"""
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
from alembic import op
|
||||
from app.zero_publication import apply_publication
|
||||
|
||||
revision: str = "155"
|
||||
down_revision: str | None = "154"
|
||||
branch_labels: str | Sequence[str] | None = None
|
||||
depends_on: str | Sequence[str] | None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Reconcile the publication by passing the migration's bind to ``apply_publication``."""
    bind = op.get_bind()
    apply_publication(bind)
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Intentionally does nothing; historical publication shapes are treated as immutable."""
    return None
|
||||
229
surfsense_backend/app/zero_publication.py
Normal file
229
surfsense_backend/app/zero_publication.py
Normal file
|
|
@ -0,0 +1,229 @@
|
|||
"""Canonical Zero publication definition for SurfSense.
|
||||
|
||||
This module is the single source of truth for ``zero_publication``. Future
|
||||
publication changes should update ``ZERO_PUBLICATION`` and call
|
||||
``apply_publication()`` from a migration instead of hand-copying table lists.
|
||||
|
||||
SurfSense runs Zero on Postgres with Zero's event triggers installed, so the
|
||||
official Zero path is a plain ``ALTER PUBLICATION ... SET TABLE``. If a future
|
||||
deployment cannot use event triggers, use Zero's documented
|
||||
``zero_0.update_schemas()`` hook as the fallback instead of COMMENT bookends.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
from collections.abc import Mapping, Sequence
|
||||
|
||||
from sqlalchemy import text
|
||||
from sqlalchemy.engine import Connection
|
||||
from sqlalchemy.ext.asyncio import create_async_engine
|
||||
|
||||
PUBLICATION_NAME = "zero_publication"
|
||||
|
||||
DOCUMENT_COLS = [
|
||||
"id",
|
||||
"title",
|
||||
"document_type",
|
||||
"search_space_id",
|
||||
"folder_id",
|
||||
"created_by_id",
|
||||
"status",
|
||||
"created_at",
|
||||
"updated_at",
|
||||
]
|
||||
|
||||
USER_COLS = [
|
||||
"id",
|
||||
"pages_limit",
|
||||
"pages_used",
|
||||
"premium_credit_micros_limit",
|
||||
"premium_credit_micros_used",
|
||||
]
|
||||
|
||||
AUTOMATION_RUN_COLS = [
|
||||
"id",
|
||||
"automation_id",
|
||||
"trigger_id",
|
||||
"status",
|
||||
"step_results",
|
||||
"started_at",
|
||||
"finished_at",
|
||||
"created_at",
|
||||
]
|
||||
|
||||
ZERO_PUBLICATION: Mapping[str, Sequence[str] | None] = {
|
||||
"notifications": None,
|
||||
"documents": DOCUMENT_COLS,
|
||||
"folders": None,
|
||||
"search_source_connectors": None,
|
||||
"new_chat_messages": None,
|
||||
"chat_comments": None,
|
||||
"chat_session_state": None,
|
||||
"user": USER_COLS,
|
||||
"automation_runs": AUTOMATION_RUN_COLS,
|
||||
}
|
||||
|
||||
|
||||
def _quote_identifier(identifier: str) -> str:
|
||||
return '"' + identifier.replace('"', '""') + '"'
|
||||
|
||||
|
||||
def _column_exists(conn: Connection, table: str, column: str) -> bool:
    """Report whether ``table`` in the current schema has a column named ``column``.

    The lookup goes through ``information_schema.columns`` with bound
    parameters, so the names are never interpolated into the SQL text.
    """
    lookup = text(
        "SELECT 1 FROM information_schema.columns "
        "WHERE table_schema = current_schema() "
        "AND table_name = :table AND column_name = :column"
    )
    match = conn.execute(lookup, {"table": table, "column": column}).fetchone()
    return match is not None
|
||||
|
||||
|
||||
def _expected_columns(conn: Connection, table: str) -> list[str] | None:
    """Return the column list ``table`` should publish, or ``None`` for all columns.

    For ``documents`` and ``user`` the ``_0_version`` column is appended when
    it exists in the database; the existence check is only issued for those
    two tables.
    """
    declared = ZERO_PUBLICATION[table]
    if declared is None:
        return None

    has_version_column = table in {"documents", "user"} and _column_exists(
        conn, table, "_0_version"
    )
    if has_version_column:
        return [*declared, "_0_version"]
    return list(declared)
|
||||
|
||||
|
||||
def _format_table_entry(conn: Connection, table: str) -> str:
    """Render one ``SET TABLE`` entry: the quoted table, plus a column list if any."""
    columns = _expected_columns(conn, table)
    quoted_table = _quote_identifier(table)
    if columns is not None:
        quoted_columns = ", ".join(map(_quote_identifier, columns))
        return f"{quoted_table} ({quoted_columns})"
    # No explicit column list: the bare table name publishes every column.
    return quoted_table
|
||||
|
||||
|
||||
def build_set_table_sql(conn: Connection) -> str:
    """Assemble the plain ``ALTER PUBLICATION ... SET TABLE`` statement.

    Entries follow the iteration order of ``ZERO_PUBLICATION``.
    """
    entries = [_format_table_entry(conn, table) for table in ZERO_PUBLICATION]
    table_list = ", ".join(entries)
    publication = _quote_identifier(PUBLICATION_NAME)
    return f"ALTER PUBLICATION {publication} SET TABLE {table_list}"
|
||||
|
||||
|
||||
def apply_publication(conn: Connection) -> None:
    """Bring ``zero_publication`` in line with the canonical definition.

    Does nothing when the publication is absent from ``pg_publication``.
    """
    lookup = text("SELECT 1 FROM pg_publication WHERE pubname = :name")
    found = conn.execute(lookup, {"name": PUBLICATION_NAME}).fetchone()
    if found:
        conn.execute(text(build_set_table_sql(conn)))
|
||||
|
||||
|
||||
def _actual_publication_shape(conn: Connection) -> dict[str, list[str] | None]:
    """Read the publication's current tables and column lists from Postgres.

    Only tables in the current schema are considered. A table maps to ``None``
    when its ``pg_publication_rel.prattrs`` is NULL (no column list), otherwise
    to the names reported in ``pg_publication_tables.attnames``.
    """
    query = text(
        "SELECT pt.tablename, pr.prattrs IS NULL AS all_columns, pt.attnames "
        "FROM pg_publication_tables pt "
        "JOIN pg_publication p ON p.pubname = pt.pubname "
        "JOIN pg_class c ON c.relname = pt.tablename "
        "JOIN pg_namespace n ON n.oid = c.relnamespace AND n.nspname = pt.schemaname "
        "JOIN pg_publication_rel pr ON pr.prpubid = p.oid AND pr.prrelid = c.oid "
        "WHERE pt.pubname = :name AND pt.schemaname = current_schema() "
        "ORDER BY pt.tablename"
    )
    records = conn.execute(query, {"name": PUBLICATION_NAME}).mappings()

    shape: dict[str, list[str] | None] = {}
    for record in records:
        table_name = str(record["tablename"])
        if record["all_columns"]:
            shape[table_name] = None
        else:
            shape[table_name] = list(record["attnames"] or [])
    return shape
|
||||
|
||||
|
||||
def expected_publication_shape(conn: Connection) -> dict[str, list[str] | None]:
    """Map every table in ``ZERO_PUBLICATION`` to its expected column list."""
    shape: dict[str, list[str] | None] = {}
    for table in ZERO_PUBLICATION:
        shape[table] = _expected_columns(conn, table)
    return shape
|
||||
|
||||
|
||||
def verify_publication(conn: Connection) -> list[str]:
    """Return human-readable mismatches between Postgres and the canonical shape.

    Args:
        conn: Synchronous SQLAlchemy connection used for the catalog queries.

    Returns:
        An empty list when the publication matches. Otherwise one message per
        problem: the publication is missing, an expected table is absent, a
        table's column set differs (order is ignored), or an unexpected table
        is published.
    """

    def _label(columns: Sequence[str] | None) -> object:
        # Only ``None`` means "all columns". An empty list is a real (and
        # wrong) column list and must not be reported as ALL.
        return "ALL" if columns is None else columns

    publication_exists = conn.execute(
        text("SELECT 1 FROM pg_publication WHERE pubname = :name"),
        {"name": PUBLICATION_NAME},
    ).fetchone()
    if not publication_exists:
        return [f"Publication {PUBLICATION_NAME!r} does not exist"]

    actual = _actual_publication_shape(conn)
    expected = expected_publication_shape(conn)
    mismatches: list[str] = []

    for table, expected_columns in expected.items():
        if table not in actual:
            mismatches.append(f"{table}: missing from publication")
            continue

        actual_columns = actual[table]
        # Compare as sorted lists so column order does not matter.
        actual_key = sorted(actual_columns) if actual_columns is not None else None
        expected_key = sorted(expected_columns) if expected_columns is not None else None
        if actual_key != expected_key:
            mismatches.append(
                f"{table}: expected columns {_label(expected_columns)}, "
                f"got {_label(actual_columns)}"
            )

    for table in sorted(set(actual) - set(expected)):
        mismatches.append(f"{table}: unexpected table in publication")

    return mismatches
|
||||
|
||||
|
||||
async def _verify_cli() -> int:
|
||||
database_url = os.getenv("DATABASE_URL")
|
||||
if not database_url:
|
||||
print("DATABASE_URL is required to verify zero_publication.", file=sys.stderr)
|
||||
return 2
|
||||
|
||||
engine = create_async_engine(database_url)
|
||||
async with engine.connect() as async_conn:
|
||||
def run_verify(sync_conn: Connection) -> list[str]:
|
||||
return verify_publication(sync_conn)
|
||||
|
||||
mismatches = await async_conn.run_sync(run_verify)
|
||||
|
||||
await engine.dispose()
|
||||
|
||||
if mismatches:
|
||||
print("zero_publication shape mismatch:", file=sys.stderr)
|
||||
for mismatch in mismatches:
|
||||
print(f" - {mismatch}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
print("zero_publication shape verified.")
|
||||
return 0
|
||||
|
||||
|
||||
def main() -> int:
    """Command-line entry point; returns the process exit status.

    With ``--verify`` the publication check runs and its status is returned.
    Without it, the help text is printed and ``2`` is returned.
    """
    parser = argparse.ArgumentParser(description="Manage SurfSense's Zero publication")
    parser.add_argument("--verify", action="store_true", help="verify zero_publication shape")
    options = parser.parse_args()

    if not options.verify:
        parser.print_help()
        return 2
    return asyncio.run(_verify_cli())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
|
|
@ -49,10 +49,10 @@ trap cleanup SIGTERM SIGINT
|
|||
# ── Database migrations (only for migrate / all) ─────────────
|
||||
# Fail-fast contract:
|
||||
# - alembic upgrade head must succeed within ${MIGRATION_TIMEOUT:-900}s
|
||||
# - zero_publication must exist in pg_publication afterwards
|
||||
# - zero_publication must match the canonical app.zero_publication shape
|
||||
# Either failure exits non-zero so the dedicated `migrations` compose
|
||||
# service exits non-zero, halting the rest of the stack instead of
|
||||
# silently producing a half-built system that crash-loops zero-cache.
|
||||
# silently producing a drifted Zero publication.
|
||||
run_migrations() {
|
||||
echo "Running database migrations..."
|
||||
for i in {1..30}; do
|
||||
|
|
@ -73,58 +73,13 @@ run_migrations() {
|
|||
fi
|
||||
echo "Migrations completed successfully."
|
||||
|
||||
echo "Verifying zero_publication exists in Postgres..."
|
||||
local pub_oid
|
||||
pub_oid=$(python <<'PY' 2>/dev/null || true
|
||||
import asyncio
|
||||
import sys
|
||||
from sqlalchemy import text
|
||||
from app.db import engine
|
||||
|
||||
|
||||
async def get_oid():
|
||||
async with engine.connect() as conn:
|
||||
result = await conn.execute(
|
||||
text("SELECT oid FROM pg_publication WHERE pubname = 'zero_publication'")
|
||||
)
|
||||
row = result.first()
|
||||
if row is None:
|
||||
sys.exit(1)
|
||||
print(int(row[0]))
|
||||
|
||||
|
||||
asyncio.run(get_oid())
|
||||
PY
|
||||
)
|
||||
if [ -z "${pub_oid}" ]; then
|
||||
echo "ERROR: zero_publication is missing from Postgres after running alembic." >&2
|
||||
echo "This usually means migration 116 (or a later publication migration) did not run." >&2
|
||||
echo "Verifying zero_publication matches the canonical shape..."
|
||||
if ! python -m app.zero_publication --verify; then
|
||||
echo "ERROR: zero_publication does not match the canonical shape." >&2
|
||||
echo "Inspect alembic state with:" >&2
|
||||
echo " docker compose exec db psql -U \"\$DB_USER\" -d \"\$DB_NAME\" -c 'SELECT * FROM alembic_version;'" >&2
|
||||
exit 1
|
||||
fi
|
||||
echo "zero_publication verified (oid=${pub_oid})."
|
||||
|
||||
# Stale-replica safety net: if /zero-init is mounted (i.e. we are the
|
||||
# dedicated `migrations` compose service), drop a marker file when the
|
||||
# publication oid changed (or on first run) so the wrapped zero-cache
|
||||
# entrypoint can wipe /data/zero.db before starting. This recovers from
|
||||
# the case where a previous zero-cache crashed mid-init and left a
|
||||
# half-built SQLite replica without a `_zero.tableMetadata` table.
|
||||
if [ -d /zero-init ]; then
|
||||
local stored_oid=""
|
||||
[ -f /zero-init/last_pub_oid ] && stored_oid=$(cat /zero-init/last_pub_oid 2>/dev/null || true)
|
||||
if [ -z "${stored_oid}" ] || [ "${stored_oid}" != "${pub_oid}" ]; then
|
||||
echo "Publication oid changed (stored=${stored_oid:-<none>}, current=${pub_oid}); writing /zero-init/needs_reset."
|
||||
: > /zero-init/needs_reset
|
||||
chmod 666 /zero-init/needs_reset 2>/dev/null || true
|
||||
fi
|
||||
echo "${pub_oid}" > /zero-init/last_pub_oid
|
||||
chmod 666 /zero-init/last_pub_oid 2>/dev/null || true
|
||||
# World-writable dir so the (possibly non-root) zero-cache container
|
||||
# can `rm -f /zero-init/needs_reset` after acting on the marker.
|
||||
chmod 777 /zero-init 2>/dev/null || true
|
||||
fi
|
||||
}
|
||||
|
||||
# ── Service starters ─────────────────────────────────────────
|
||||
|
|
|
|||
|
|
@ -327,11 +327,13 @@ Symptom (in `docker compose logs zero-cache`):
|
|||
Error: Unknown or invalid publications. Specified: [zero_publication]. Found: []
|
||||
```
|
||||
|
||||
This means `zero-cache` started before `zero_publication` was created. With
|
||||
the current compose files this should be impossible. The `migrations`
|
||||
service blocks `zero-cache` from starting. If you see it, your stack
|
||||
predates the fix or you brought up `zero-cache` manually with `docker
|
||||
compose up zero-cache` before the migrations service ran.
|
||||
This means `zero-cache` started before `zero_publication` was created or the
|
||||
publication does not match SurfSense's canonical Zero shape. With the current
|
||||
compose files this should be impossible: the `migrations` service blocks
|
||||
`zero-cache` from starting and verifies the publication before exiting
|
||||
successfully. If you see it, your stack predates the fix or you brought up
|
||||
`zero-cache` manually with `docker compose up zero-cache` before the migrations
|
||||
service ran.
|
||||
|
||||
Recovery:
|
||||
|
||||
|
|
@ -341,18 +343,13 @@ docker volume rm surfsense-zero-cache # wipe half-built SQLite replica
|
|||
docker compose up -d # migrations runs first, then zero-cache
|
||||
```
|
||||
|
||||
The install script (`install.ps1` / `install.sh`) detects this case
|
||||
automatically: if it finds a `surfsense-zero-cache` volume from a previous
|
||||
install with no matching `surfsense-zero-init` volume, it removes the stale
|
||||
volume before bringing the stack up.
|
||||
|
||||
### Zero-cache crashes with `_zero.tableMetadata` errors
|
||||
|
||||
This indicates a half-initialized SQLite replica left behind by a previous
|
||||
crash. The `migrations` service writes a marker file on a shared volume
|
||||
(`surfsense-zero-init`) when the publication oid changes; zero-cache wipes
|
||||
its replica and re-syncs on next start. If the marker mechanism somehow did
|
||||
not trigger, run the recovery one-liner above.
|
||||
crash. Zero's own event triggers and `ZERO_AUTO_RESET` handle schema changes and
|
||||
replication halts automatically. If the local SQLite replica is wedged, run the
|
||||
recovery one-liner above to wipe `surfsense-zero-cache`; zero-cache will
|
||||
re-sync from Postgres on the next start.
|
||||
|
||||
### Ensuring `wal_level = logical`
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue