feat: implement Redis heartbeat tracking for connector indexing tasks and update stale notification cleanup logic

2026-05-25 19:15:18 +02:00 · 2026-02-02 00:18:47 +05:30 · 2026-02-02 00:18:47 +05:30 · 05d1d6ac04
commit 05d1d6ac04
parent 085653d3e3
2 changed files with 139 additions and 69 deletions
--- a/surfsense_backend/app/routes/search_source_connectors_routes.py
+++ b/surfsense_backend/app/routes/search_source_connectors_routes.py
@ -19,10 +19,12 @@ Non-OAuth connectors (BookStack, GitHub, etc.) are limited to one per search spa
 """
 import logging
 import os
 from datetime import UTC, datetime, timedelta
 from typing import Any
 import pytz
 import redis
 from dateutil.parser import isoparse
 from fastapi import APIRouter, Body, Depends, HTTPException, Query
 from pydantic import BaseModel, Field, ValidationError
@ -78,6 +80,27 @@ from app.utils.rbac import check_permission
 # Set up logging
 logger = logging.getLogger(__name__)
 # Redis client for heartbeat tracking
 _heartbeat_redis_client: redis.Redis | None = None
 # Redis key TTL - notification is stale if no heartbeat in this time
 HEARTBEAT_TTL_SECONDS = 120  # 2 minutes
 def get_heartbeat_redis_client() -> redis.Redis:
    """Get or create Redis client for heartbeat tracking."""
    global _heartbeat_redis_client
    if _heartbeat_redis_client is None:
        redis_url = os.getenv("CELERY_BROKER_URL", "redis://localhost:6379/0")
        _heartbeat_redis_client = redis.from_url(redis_url, decode_responses=True)
    return _heartbeat_redis_client
 def _get_heartbeat_key(notification_id: int) -> str:
    """Generate Redis key for notification heartbeat."""
    return f"indexing:heartbeat:{notification_id}"
 router = APIRouter()
@ -1200,6 +1223,16 @@ async def _run_indexing_with_notifications(
                )
            )
            # Set initial Redis heartbeat for stale detection
            if notification:
                try:
                    heartbeat_key = _get_heartbeat_key(notification.id)
                    get_heartbeat_redis_client().setex(
                        heartbeat_key, HEARTBEAT_TTL_SECONDS, "0"
                    )
                except Exception as e:
                    logger.warning(f"Failed to set initial Redis heartbeat: {e}")
        # Update notification to fetching stage
        if notification:
            await NotificationService.connector_indexing.notify_indexing_progress(
@ -1241,6 +1274,17 @@ async def _run_indexing_with_notifications(
            current_indexed_count = indexed_count
            if notification:
                try:
                    # Set Redis heartbeat key with TTL (fast, for stale detection)
                    heartbeat_key = _get_heartbeat_key(notification.id)
                    get_heartbeat_redis_client().setex(
                        heartbeat_key, HEARTBEAT_TTL_SECONDS, str(indexed_count)
                    )
                except Exception as e:
                    # Don't let Redis errors break the indexing
                    logger.warning(f"Failed to set Redis heartbeat: {e}")
                try:
                    # Still update DB notification for progress display
                    await session.refresh(notification)
                    await (
                        NotificationService.connector_indexing.notify_indexing_progress(
@ -1473,6 +1517,14 @@ async def _run_indexing_with_notifications(
                )
            except Exception as notif_error:
                logger.error(f"Failed to update notification: {notif_error!s}")
    finally:
        # Clean up Redis heartbeat key when task completes (success or failure)
        if notification:
            try:
                heartbeat_key = _get_heartbeat_key(notification.id)
                get_heartbeat_redis_client().delete(heartbeat_key)
            except Exception:
                pass  # Ignore cleanup errors - key will expire anyway
 async def run_notion_indexing_with_new_session(
--- a/surfsense_backend/app/tasks/celery_tasks/stale_notification_cleanup_task.py
+++ b/surfsense_backend/app/tasks/celery_tasks/stale_notification_cleanup_task.py
@ -1,18 +1,25 @@
 """Celery task to detect and mark stale connector indexing notifications as failed.
 This task runs periodically (every 5 minutes by default) to find notifications
-that are stuck in "in_progress" status but haven't received a heartbeat update
+that are stuck in "in_progress" status but don't have an active Redis heartbeat key.
-in the configured timeout period. These are marked as "failed" to prevent the
+These are marked as "failed" to prevent the frontend from showing a perpetual "syncing" state.
-frontend from showing a perpetual "syncing" state.
+
 Detection mechanism:
 - Active indexing tasks set a Redis key with TTL (2 minutes) as a heartbeat
 - If the task crashes, the Redis key expires automatically
 - This cleanup task checks for in-progress notifications without a Redis heartbeat key
 - Such notifications are marked as failed with O(1) batch UPDATE
 """
 import json
 import logging
-from datetime import UTC, datetime, timedelta
+import os
 from datetime import UTC, datetime
-from sqlalchemy import and_
+import redis
 from sqlalchemy import and_, text
 from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
 from sqlalchemy.future import select
 from sqlalchemy.orm.attributes import flag_modified
 from sqlalchemy.pool import NullPool
 from app.celery_app import celery_app
@ -21,10 +28,22 @@ from app.db import Notification
 logger = logging.getLogger(__name__)
-# Timeout in minutes - notifications without heartbeat for this long are marked as failed
+# Redis client for checking heartbeats
-# Should be longer than HEARTBEAT_INTERVAL_SECONDS (30s) * a reasonable number of missed heartbeats
+_redis_client: redis.Redis | None = None
-# 5 minutes = 10 missed heartbeats, which is a reasonable threshold
+
-STALE_NOTIFICATION_TIMEOUT_MINUTES = 5
+
 def get_redis_client() -> redis.Redis:
    """Get or create Redis client for heartbeat checking."""
    global _redis_client
    if _redis_client is None:
        redis_url = os.getenv("CELERY_BROKER_URL", "redis://localhost:6379/0")
        _redis_client = redis.from_url(redis_url, decode_responses=True)
    return _redis_client
 def _get_heartbeat_key(notification_id: int) -> str:
    """Generate Redis key for notification heartbeat."""
    return f"indexing:heartbeat:{notification_id}"
 def get_celery_session_maker():
@ -45,9 +64,9 @@ def cleanup_stale_indexing_notifications_task():
    This task finds notifications that:
    - Have type = 'connector_indexing'
    - Have metadata.status = 'in_progress'
-    - Have updated_at older than STALE_NOTIFICATION_TIMEOUT_MINUTES
+    - Do NOT have a corresponding Redis heartbeat key (meaning task crashed)
-    And marks them as failed with an appropriate error message.
+    And marks them as failed with O(1) batch UPDATE.
    """
    import asyncio
@ -61,84 +80,83 @@ def cleanup_stale_indexing_notifications_task():
 async def _cleanup_stale_notifications():
-    """Find and mark stale connector indexing notifications as failed."""
+    """Find and mark stale connector indexing notifications as failed.
    Uses Redis TTL-based detection:
    1. Find all in-progress notifications
    2. Check which ones are missing their Redis heartbeat key
    3. Mark those as failed with O(1) batch UPDATE using JSONB || operator
    """
    async with get_celery_session_maker()() as session:
        try:
-            # Calculate the cutoff time
+            # Find all in-progress connector indexing notifications
            cutoff_time = datetime.now(UTC) - timedelta(
                minutes=STALE_NOTIFICATION_TIMEOUT_MINUTES
            )
            # Find stale notifications:
            # - type = 'connector_indexing'
            # - metadata->>'status' = 'in_progress'
            # - updated_at < cutoff_time
            result = await session.execute(
-                select(Notification).filter(
+                select(Notification.id).where(
                    and_(
                        Notification.type == "connector_indexing",
                        Notification.notification_metadata["status"].astext
                        == "in_progress",
                        Notification.updated_at < cutoff_time,
                    )
                )
            )
-            stale_notifications = result.scalars().all()
+            in_progress_ids = [row[0] for row in result.fetchall()]
-            if not stale_notifications:
+            if not in_progress_ids:
-                logger.debug("No stale connector indexing notifications found")
+                logger.debug("No in-progress connector indexing notifications found")
                return
            # Check which ones are missing heartbeat keys in Redis
            redis_client = get_redis_client()
            stale_notification_ids = []
            for notification_id in in_progress_ids:
                heartbeat_key = _get_heartbeat_key(notification_id)
                if not redis_client.exists(heartbeat_key):
                    stale_notification_ids.append(notification_id)
            if not stale_notification_ids:
                logger.debug(
                    f"All {len(in_progress_ids)} in-progress notifications have active Redis heartbeats"
                )
                return
            logger.warning(
-                f"Found {len(stale_notifications)} stale connector indexing notifications "
+                f"Found {len(stale_notification_ids)} stale connector indexing notifications "
-                f"(no heartbeat for >{STALE_NOTIFICATION_TIMEOUT_MINUTES} minutes)"
+                f"(no Redis heartbeat key): {stale_notification_ids}"
            )
-            # Mark each stale notification as failed
+            # O(1) Batch UPDATE using JSONB || operator
-            for notification in stale_notifications:
+            # This merges the update data into existing notification_metadata
-                try:
+            # Also updates title and message for proper UI display
-                    # Get current indexed count from metadata if available
+            error_message = (
-                    indexed_count = notification.notification_metadata.get(
+                "Something went wrong while syncing your content. Please retry."
-                        "indexed_count", 0
+            )
                    )
                    connector_name = notification.notification_metadata.get(
                        "connector_name", "Unknown"
                    )
-                    # Calculate how long it's been stale
+            update_data = {
-                    stale_duration = datetime.now(UTC) - notification.updated_at
+                "status": "failed",
-                    stale_minutes = int(stale_duration.total_seconds() / 60)
+                "completed_at": datetime.now(UTC).isoformat(),
                "error_message": error_message,
                "sync_stage": "failed",
            }
-                    # Update notification metadata
+            await session.execute(
-                    notification.notification_metadata["status"] = "failed"
+                text("""
-                    notification.notification_metadata["completed_at"] = datetime.now(
+                    UPDATE notifications 
-                        UTC
+                    SET metadata = metadata || CAST(:update_json AS jsonb),
-                    ).isoformat()
+                        title = 'Failed: ' || COALESCE(metadata->>'connector_name', 'Connector'),
-                    notification.notification_metadata["error_message"] = (
+                        message = :display_message
-                        f"Indexing task appears to have crashed or timed out. "
+                    WHERE id = ANY(:ids)
-                        f"No activity detected for {stale_minutes} minutes. "
+                """),
-                        f"Please try syncing again."
+                {
-                    )
+                    "update_json": json.dumps(update_data),
                    "display_message": f"{error_message}",
                    "ids": stale_notification_ids,
                },
            )
                    # Flag the JSONB column as modified for SQLAlchemy to detect the change
                    flag_modified(notification, "notification_metadata")
                    logger.info(
                        f"Marking notification {notification.id} for connector '{connector_name}' as failed "
                        f"(stale for {stale_minutes} minutes, indexed {indexed_count} items before failure)"
                    )
                except Exception as e:
                    logger.error(
                        f"Error marking notification {notification.id} as failed: {e!s}",
                        exc_info=True,
                    )
                    continue
            # Commit all changes
            await session.commit()
            logger.info(
-                f"Successfully marked {len(stale_notifications)} stale notifications as failed"
+                f"Successfully marked {len(stale_notification_ids)} stale notifications as failed (batch UPDATE)"
            )
        except Exception as e: