fix(db): reap orphaned idle-in-transaction sessions on the Celery engine

The long-running ingestion/podcast/video tasks run on a separate Celery
engine (NullPool), so the web engine's idle_in_transaction_session_timeout
did not cover them — which is exactly where the original 11h zombie
(INSERT INTO chunks) came from. Apply the same protection to the Celery
engine with a generous 60-minute default, so a worker that hangs or crashes
mid-transaction can't hold locks on documents/chunks indefinitely, while a
session in a legitimate per-document embed window is never reaped.

- config + .env.example: DB_CELERY_IDLE_IN_TX_TIMEOUT_MS (default 3600000).

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-06-16 16:26:04 -07:00
parent 89cc3b37ee
commit da64433439
3 changed files with 23 additions and 0 deletions

View file

@ -10,6 +10,10 @@ DATABASE_URL=postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense
# idle_in_transaction_session_timeout (ms) so an abandoned "idle in transaction"
# session can't wedge the DB indefinitely. 0 disables. (asyncpg only)
# DB_IDLE_IN_TX_TIMEOUT_MS=900000
# Same, for the Celery worker engine (long ingestion/podcast/video tasks). A
# session that sits idle inside an open transaction for longer than this window
# is treated as orphaned and terminated by PostgreSQL. 0 disables. (asyncpg only)
# DB_CELERY_IDLE_IN_TX_TIMEOUT_MS=3600000
# Deployment environment: dev or production
SURFSENSE_ENV=dev

View file

@ -555,6 +555,13 @@ class Config:
# connection so an abandoned "idle in transaction" session can't wedge the
# database indefinitely. 0 disables. Only applied to asyncpg connections.
DB_IDLE_IN_TX_TIMEOUT_MS = int(os.getenv("DB_IDLE_IN_TX_TIMEOUT_MS", "900000"))
# Same protection for the separate Celery worker engine, where long-running
# ingestion/podcast/video tasks live. Kept higher than the web default so a
# legitimate per-document embed window is never reaped: only a session that
# sits idle inside an open transaction for longer than this timeout (60 min by
# default) is treated as orphaned and terminated. 0 disables.
DB_CELERY_IDLE_IN_TX_TIMEOUT_MS = int(
os.getenv("DB_CELERY_IDLE_IN_TX_TIMEOUT_MS", "3600000")
)
# Celery / Redis
# Redis (single endpoint for Celery broker, result backend, and app cache).

View file

@ -32,10 +32,22 @@ def get_celery_session_maker() -> async_sessionmaker:
"""
global _celery_engine, _celery_session_maker
if _celery_session_maker is None:
# Reap connections orphaned mid-transaction (e.g. a worker that hung or
# crashed mid-index) so they can't hold locks on documents/chunks and
# wedge writes — the failure mode that previously left an "idle in
# transaction" session holding locks for 11+ hours. Kept generous so a
# legitimate long per-document embed window is never killed.
connect_args: dict = {}
idle_ms = config.DB_CELERY_IDLE_IN_TX_TIMEOUT_MS
if idle_ms and idle_ms > 0 and config.DATABASE_URL and "asyncpg" in config.DATABASE_URL:
connect_args["server_settings"] = {
"idle_in_transaction_session_timeout": str(idle_ms)
}
_celery_engine = create_async_engine(
config.DATABASE_URL,
poolclass=NullPool,
echo=False,
connect_args=connect_args,
)
with contextlib.suppress(Exception):
from app.observability.bootstrap import instrument_sqlalchemy_engine