mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-18 21:15:16 +02:00
fix(db): reap orphaned idle-in-transaction sessions on the Celery engine
The long-running ingestion/podcast/video tasks run on a separate Celery engine (NullPool), so the web engine's idle_in_transaction_session_timeout did not cover them — which is exactly where the original 11h zombie (INSERT INTO chunks) came from. Apply the same protection to the Celery engine with a generous 60-minute default so a worker that hangs/crashes mid-transaction can't hold locks on documents/chunks indefinitely, while never reaping a legitimate per-document embed window. - config + .env.example: DB_CELERY_IDLE_IN_TX_TIMEOUT_MS (default 3600000). Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
parent
89cc3b37ee
commit
da64433439
3 changed files with 23 additions and 0 deletions
|
|
@ -10,6 +10,10 @@ DATABASE_URL=postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense
|
|||
# idle_in_transaction_session_timeout (ms) so an abandoned "idle in transaction"
|
||||
# session can't wedge the DB indefinitely. 0 disables. (asyncpg only)
|
||||
# DB_IDLE_IN_TX_TIMEOUT_MS=900000
|
||||
# Same, for the Celery worker engine (long ingestion/podcast/video tasks). If a
|
||||
# task hasn't touched the DB in this window it's treated as orphaned and dropped.
|
||||
# 0 disables. (asyncpg only)
|
||||
# DB_CELERY_IDLE_IN_TX_TIMEOUT_MS=3600000
|
||||
|
||||
# Deployment environment: dev or production
|
||||
SURFSENSE_ENV=dev
|
||||
|
|
|
|||
|
|
@ -555,6 +555,13 @@ class Config:
|
|||
# connection so an abandoned "idle in transaction" session can't wedge the
|
||||
# database indefinitely. 0 disables. Only applied to asyncpg connections.
|
||||
DB_IDLE_IN_TX_TIMEOUT_MS = int(os.getenv("DB_IDLE_IN_TX_TIMEOUT_MS", "900000"))
|
||||
# Same protection for the separate Celery worker engine, where long-running
|
||||
# ingestion/podcast/video tasks live. Kept higher than the web default so a
|
||||
# legitimate per-document embed window is never reaped: if a task hasn't
|
||||
# touched the DB in 60 min it's treated as orphaned and dropped. 0 disables.
|
||||
DB_CELERY_IDLE_IN_TX_TIMEOUT_MS = int(
|
||||
os.getenv("DB_CELERY_IDLE_IN_TX_TIMEOUT_MS", "3600000")
|
||||
)
|
||||
|
||||
# Celery / Redis
|
||||
# Redis (single endpoint for Celery broker, result backend, and app cache).
|
||||
|
|
|
|||
|
|
@ -32,10 +32,22 @@ def get_celery_session_maker() -> async_sessionmaker:
|
|||
"""
|
||||
global _celery_engine, _celery_session_maker
|
||||
if _celery_session_maker is None:
|
||||
# Reap connections orphaned mid-transaction (e.g. a worker that hung or
|
||||
# crashed mid-index) so they can't hold locks on documents/chunks and
|
||||
# wedge writes — the failure mode that previously left an "idle in
|
||||
# transaction" session holding locks for 11+ hours. Kept generous so a
|
||||
# legitimate long per-document embed window is never killed.
|
||||
connect_args: dict = {}
|
||||
idle_ms = config.DB_CELERY_IDLE_IN_TX_TIMEOUT_MS
|
||||
if idle_ms and idle_ms > 0 and config.DATABASE_URL and "asyncpg" in config.DATABASE_URL:
|
||||
connect_args["server_settings"] = {
|
||||
"idle_in_transaction_session_timeout": str(idle_ms)
|
||||
}
|
||||
_celery_engine = create_async_engine(
|
||||
config.DATABASE_URL,
|
||||
poolclass=NullPool,
|
||||
echo=False,
|
||||
connect_args=connect_args,
|
||||
)
|
||||
with contextlib.suppress(Exception):
|
||||
from app.observability.bootstrap import instrument_sqlalchemy_engine
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue