diff --git a/surfsense_backend/.env.example b/surfsense_backend/.env.example index d6c0a634e..7e5c9b6f0 100644 --- a/surfsense_backend/.env.example +++ b/surfsense_backend/.env.example @@ -10,6 +10,10 @@ DATABASE_URL=postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense # idle_in_transaction_session_timeout (ms) so an abandoned "idle in transaction" # session can't wedge the DB indefinitely. 0 disables. (asyncpg only) # DB_IDLE_IN_TX_TIMEOUT_MS=900000 +# Same, for the Celery worker engine (long ingestion/podcast/video tasks). If a +# task hasn't touched the DB in this window it's treated as orphaned and dropped. +# 0 disables. (asyncpg only) +# DB_CELERY_IDLE_IN_TX_TIMEOUT_MS=3600000 # Deployment environment: dev or production SURFSENSE_ENV=dev diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py index 6b090491f..69fb023fe 100644 --- a/surfsense_backend/app/config/__init__.py +++ b/surfsense_backend/app/config/__init__.py @@ -555,6 +555,13 @@ class Config: # connection so an abandoned "idle in transaction" session can't wedge the # database indefinitely. 0 disables. Only applied to asyncpg connections. DB_IDLE_IN_TX_TIMEOUT_MS = int(os.getenv("DB_IDLE_IN_TX_TIMEOUT_MS", "900000")) + # Same protection for the separate Celery worker engine, where long-running + # ingestion/podcast/video tasks live. Kept higher than the web default so a + # legitimate per-document embed window is never reaped: if a task hasn't + # touched the DB in 60 min it's treated as orphaned and dropped. 0 disables. + DB_CELERY_IDLE_IN_TX_TIMEOUT_MS = int( + os.getenv("DB_CELERY_IDLE_IN_TX_TIMEOUT_MS", "3600000") + ) # Celery / Redis # Redis (single endpoint for Celery broker, result backend, and app cache). diff --git a/surfsense_backend/app/tasks/celery_tasks/__init__.py b/surfsense_backend/app/tasks/celery_tasks/__init__.py index 6ea7a2e68..ad439c7b4 100644 --- a/surfsense_backend/app/tasks/celery_tasks/__init__.py +++ b/surfsense_backend/app/tasks/celery_tasks/__init__.py @@ -32,10 +32,22 @@ def get_celery_session_maker() -> async_sessionmaker: """ global _celery_engine, _celery_session_maker if _celery_session_maker is None: + # Reap connections orphaned mid-transaction (e.g. a worker that hung or + # crashed mid-index) so they can't hold locks on documents/chunks and + # wedge writes — the failure mode that previously left an "idle in + # transaction" session holding locks for 11+ hours. Kept generous so a + # legitimate long per-document embed window is never killed. + connect_args: dict = {} + idle_ms = config.DB_CELERY_IDLE_IN_TX_TIMEOUT_MS + if idle_ms and idle_ms > 0 and config.DATABASE_URL and "asyncpg" in config.DATABASE_URL: + connect_args["server_settings"] = { + "idle_in_transaction_session_timeout": str(idle_ms) + } _celery_engine = create_async_engine( config.DATABASE_URL, poolclass=NullPool, echo=False, + connect_args=connect_args, ) with contextlib.suppress(Exception): from app.observability.bootstrap import instrument_sqlalchemy_engine