commit 8301e0169c
DESKTOP-RTLN3BA\$punk · 2026-02-01 18:02:27 -08:00
71 changed files with 2889 additions and 732 deletions

View file

@@ -0,0 +1,164 @@
"""Celery task to detect and mark stale connector indexing notifications as failed.

This task runs periodically (every 5 minutes by default) to find notifications
that are stuck in "in_progress" status but have no active Redis heartbeat key.
These are marked as "failed" to prevent the frontend from showing a perpetual
"syncing" state.

Detection mechanism:
- Active indexing tasks set a Redis key with a TTL (2 minutes) as a heartbeat
- If the task crashes, the Redis key expires automatically
- This cleanup task checks for in-progress notifications without a Redis heartbeat key
- Such notifications are marked as failed with a single batch UPDATE
"""

import json
import logging
import os
from datetime import UTC, datetime

import redis
from sqlalchemy import and_, text
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
from sqlalchemy.future import select
from sqlalchemy.pool import NullPool

from app.celery_app import celery_app
from app.config import config
from app.db import Notification

logger = logging.getLogger(__name__)

# Redis client for checking heartbeats
_redis_client: redis.Redis | None = None


def get_redis_client() -> redis.Redis:
    """Get or create the Redis client used for heartbeat checking."""
    global _redis_client
    if _redis_client is None:
        redis_url = os.getenv("CELERY_BROKER_URL", "redis://localhost:6379/0")
        _redis_client = redis.from_url(redis_url, decode_responses=True)
    return _redis_client


def _get_heartbeat_key(notification_id: int) -> str:
    """Generate the Redis key for a notification heartbeat."""
    return f"indexing:heartbeat:{notification_id}"


def get_celery_session_maker():
    """Create an async session maker for Celery tasks."""
    engine = create_async_engine(
        config.DATABASE_URL,
        poolclass=NullPool,
        echo=False,
    )
    return async_sessionmaker(engine, expire_on_commit=False)


@celery_app.task(name="cleanup_stale_indexing_notifications")
def cleanup_stale_indexing_notifications_task():
    """
    Check for stale connector indexing notifications and mark them as failed.

    This task finds notifications that:
    - Have type = 'connector_indexing'
    - Have metadata.status = 'in_progress'
    - Do NOT have a corresponding Redis heartbeat key (meaning the task crashed)

    and marks them as failed with a single batch UPDATE.
    """
    import asyncio

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(_cleanup_stale_notifications())
    finally:
        loop.close()


async def _cleanup_stale_notifications():
    """Find and mark stale connector indexing notifications as failed.

    Uses Redis TTL-based detection:
    1. Find all in-progress notifications
    2. Check which ones are missing their Redis heartbeat key
    3. Mark those as failed with a single batch UPDATE using the JSONB || operator
    """
    async with get_celery_session_maker()() as session:
        try:
            # Find all in-progress connector indexing notifications
            result = await session.execute(
                select(Notification.id).where(
                    and_(
                        Notification.type == "connector_indexing",
                        Notification.notification_metadata["status"].astext
                        == "in_progress",
                    )
                )
            )
            in_progress_ids = [row[0] for row in result.fetchall()]
            if not in_progress_ids:
                logger.debug("No in-progress connector indexing notifications found")
                return

            # Check which ones are missing heartbeat keys in Redis
            redis_client = get_redis_client()
            stale_notification_ids = []
            for notification_id in in_progress_ids:
                heartbeat_key = _get_heartbeat_key(notification_id)
                if not redis_client.exists(heartbeat_key):
                    stale_notification_ids.append(notification_id)

            if not stale_notification_ids:
                logger.debug(
                    f"All {len(in_progress_ids)} in-progress notifications have active Redis heartbeats"
                )
                return

            logger.warning(
                f"Found {len(stale_notification_ids)} stale connector indexing notifications "
                f"(no Redis heartbeat key): {stale_notification_ids}"
            )

            # Single batch UPDATE using the JSONB || operator.
            # This merges the update data into the existing notification_metadata
            # and also updates title and message for proper UI display.
            error_message = (
                "Something went wrong while syncing your content. Please retry."
            )
            update_data = {
                "status": "failed",
                "completed_at": datetime.now(UTC).isoformat(),
                "error_message": error_message,
                "sync_stage": "failed",
            }
            await session.execute(
                text("""
                    UPDATE notifications
                    SET metadata = metadata || CAST(:update_json AS jsonb),
                        title = 'Failed: ' || COALESCE(metadata->>'connector_name', 'Connector'),
                        message = :display_message
                    WHERE id = ANY(:ids)
                """),
                {
                    "update_json": json.dumps(update_data),
                    "display_message": error_message,
                    "ids": stale_notification_ids,
                },
            )
            await session.commit()
            logger.info(
                f"Successfully marked {len(stale_notification_ids)} stale notifications as failed (batch UPDATE)"
            )
        except Exception as e:
            logger.error(f"Error cleaning up stale notifications: {e!s}", exc_info=True)
            await session.rollback()
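How this task gets its 5-minute cadence is not shown in the diff. A minimal sketch of the scheduling side, assuming Celery beat and the task name registered above (the schedule value is an assumption matching the "every 5 minutes by default" note in the module docstring):

# Hypothetical beat wiring - not part of this commit.
from app.celery_app import celery_app

celery_app.conf.beat_schedule = {
    "cleanup-stale-indexing-notifications": {
        # Task name as registered via @celery_app.task(...) above
        "task": "cleanup_stale_indexing_notifications",
        "schedule": 300.0,  # seconds; "every 5 minutes by default"
    },
}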

View file

@@ -9,6 +9,7 @@ to avoid circular import issues with the connector_indexers package.
"""

import logging
from collections.abc import Awaitable, Callable
from importlib import import_module

from sqlalchemy.exc import SQLAlchemyError
@@ -22,6 +23,9 @@ from app.db import (
from app.services.composio_service import INDEXABLE_TOOLKITS, TOOLKIT_TO_INDEXER
from app.services.task_logging_service import TaskLoggingService

# Type alias for heartbeat callback function
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Set up logging
logger = logging.getLogger(__name__)
@@ -86,6 +90,7 @@ async def index_composio_connector(
    end_date: str | None = None,
    update_last_indexed: bool = True,
    max_items: int = 1000,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, int, str | None]:
    """
    Index content from a Composio connector.
@@ -102,6 +107,7 @@ async def index_composio_connector(
        end_date: End date for filtering (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp
        max_items: Maximum number of items to fetch
        on_heartbeat_callback: Optional callback to report progress for heartbeat updates

    Returns:
        Tuple of (number_of_indexed_items, number_of_skipped_items, error_message or None)
@@ -180,6 +186,7 @@ async def index_composio_connector(
        "log_entry": log_entry,
        "update_last_indexed": update_last_indexed,
        "max_items": max_items,
        "on_heartbeat_callback": on_heartbeat_callback,
    }

    # Add date params for toolkits that support them
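The callback threaded through index_composio_connector is the producer side of the Redis TTL mechanism from the cleanup task above. A minimal sketch of what a caller might pass, assuming the same indexing:heartbeat:{notification_id} key and the 2-minute TTL used there (make_heartbeat_callback and its parameters are hypothetical, not part of this commit):

# Hypothetical caller-side wiring - not part of this commit.
import redis.asyncio as aioredis

def make_heartbeat_callback(redis_url: str, notification_id: int):
    client = aioredis.from_url(redis_url, decode_responses=True)

    async def on_heartbeat(indexed_count: int) -> None:
        # Refresh the heartbeat key; if the worker crashes, the key expires
        # within 2 minutes and the cleanup task marks the notification failed.
        await client.set(
            f"indexing:heartbeat:{notification_id}", str(indexed_count), ex=120
        )

    return on_heartbeat

The result would then be passed as on_heartbeat_callback=make_heartbeat_callback(redis_url, notification_id) when starting the indexing run.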

View file

@@ -2,6 +2,9 @@
Airtable connector indexer.
"""

import time
from collections.abc import Awaitable, Callable

from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
@@ -27,6 +30,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_airtable_records(
    session: AsyncSession,
@@ -37,6 +46,7 @@ async def index_airtable_records(
    end_date: str | None = None,
    max_records: int = 2500,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index Airtable records for a given connector.
@@ -50,6 +60,7 @@ async def index_airtable_records(
        end_date: End date for filtering records (YYYY-MM-DD)
        max_records: Maximum number of records to fetch per table
        update_last_indexed: Whether to update the last_indexed_at timestamp
        on_heartbeat_callback: Optional callback to update the notification during long-running indexing.

    Returns:
        Tuple of (number_of_documents_processed, error_message)
@@ -127,8 +138,20 @@ async def index_airtable_records(
    logger.info(f"Found {len(bases)} Airtable bases to process")

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()
    total_documents_indexed = 0

    # Process each base
    for base in bases:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time)
            >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(total_documents_indexed)
            last_heartbeat_time = time.time()

        base_id = base.get("id")
        base_name = base.get("name", "Unknown Base")
@@ -204,6 +227,15 @@ async def index_airtable_records(
            documents_skipped = 0

            # Process each record
            for record in records:
                # Check if it's time for a heartbeat update
                if (
                    on_heartbeat_callback
                    and (time.time() - last_heartbeat_time)
                    >= HEARTBEAT_INTERVAL_SECONDS
                ):
                    await on_heartbeat_callback(total_documents_indexed)
                    last_heartbeat_time = time.time()

                try:
                    # Generate markdown content
                    markdown_content = (
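The same seven-line interval check is copied into every connector loop in this commit. A possible refactor (a sketch, not something the commit does) would hoist the pattern into a small helper with identical semantics:

# Hypothetical helper - not part of this commit.
import time
from collections.abc import Awaitable, Callable

HEARTBEAT_INTERVAL_SECONDS = 30

class HeartbeatTicker:
    """Invoke the callback at most once per interval; no-op when callback is None."""

    def __init__(self, callback: Callable[[int], Awaitable[None]] | None) -> None:
        self._callback = callback
        self._last = time.time()

    async def tick(self, indexed_count: int) -> None:
        if (
            self._callback
            and (time.time() - self._last) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await self._callback(indexed_count)
            self._last = time.time()

Each loop body would then open with await ticker.tick(total_documents_indexed) instead of the inline check.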

View file

@@ -2,6 +2,8 @@
BookStack connector indexer.
"""

import time
from collections.abc import Awaitable, Callable
from datetime import datetime

from sqlalchemy.exc import SQLAlchemyError
@@ -29,6 +31,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_bookstack_pages(
    session: AsyncSession,
@@ -38,6 +46,7 @@ async def index_bookstack_pages(
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index BookStack pages.
@@ -50,6 +59,7 @@ async def index_bookstack_pages(
        start_date: Start date for indexing (YYYY-MM-DD format)
        end_date: End date for indexing (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        on_heartbeat_callback: Optional callback to update the notification during long-running indexing.

    Returns:
        Tuple containing (number of documents indexed, error message or None)
@@ -179,7 +189,17 @@ async def index_bookstack_pages(
    skipped_pages = []
    documents_skipped = 0

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()

    for page in pages:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()

        try:
            page_id = page.get("id")
            page_name = page.get("name", "")

View file

@@ -3,6 +3,8 @@ ClickUp connector indexer.
"""

import contextlib
import time
from collections.abc import Awaitable, Callable
from datetime import datetime

from sqlalchemy.exc import SQLAlchemyError
@@ -29,6 +31,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_clickup_tasks(
    session: AsyncSession,
@@ -38,6 +46,7 @@ async def index_clickup_tasks(
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index tasks from a ClickUp workspace.
@@ -50,6 +59,7 @@ async def index_clickup_tasks(
        start_date: Start date for filtering tasks (YYYY-MM-DD format)
        end_date: End date for filtering tasks (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp
        on_heartbeat_callback: Optional callback to update the notification during long-running indexing.

    Returns:
        Tuple of (number of indexed tasks, error message if any)
@@ -132,6 +142,9 @@ async def index_clickup_tasks(
    documents_indexed = 0
    documents_skipped = 0

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()

    # Iterate workspaces and fetch tasks
    for workspace in workspaces:
        workspace_id = workspace.get("id")
@@ -170,6 +183,15 @@ async def index_clickup_tasks(
            )

            for task in tasks:
                # Check if it's time for a heartbeat update
                if (
                    on_heartbeat_callback
                    and (time.time() - last_heartbeat_time)
                    >= HEARTBEAT_INTERVAL_SECONDS
                ):
                    await on_heartbeat_callback(documents_indexed)
                    last_heartbeat_time = time.time()

                try:
                    task_id = task.get("id")
                    task_name = task.get("name", "Untitled Task")

View file

@@ -3,6 +3,8 @@ Confluence connector indexer.
"""

import contextlib
import time
from collections.abc import Awaitable, Callable
from datetime import datetime

from sqlalchemy.exc import SQLAlchemyError
@@ -30,6 +32,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_confluence_pages(
    session: AsyncSession,
@@ -39,6 +47,7 @@ async def index_confluence_pages(
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index Confluence pages and comments.
@@ -51,6 +60,7 @@ async def index_confluence_pages(
        start_date: Start date for indexing (YYYY-MM-DD format)
        end_date: End date for indexing (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        on_heartbeat_callback: Optional callback to update the notification during long-running indexing.

    Returns:
        Tuple containing (number of documents indexed, error message or None)
@@ -175,7 +185,17 @@ async def index_confluence_pages(
    skipped_pages = []
    documents_skipped = 0

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()

    for page in pages:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()

        try:
            page_id = page.get("id")
            page_title = page.get("title", "")

View file

@@ -3,6 +3,8 @@ Discord connector indexer.
"""

import asyncio
import time
from collections.abc import Awaitable, Callable
from datetime import UTC, datetime, timedelta

from sqlalchemy.exc import SQLAlchemyError
@@ -28,6 +30,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds - update the notification every 30 seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_discord_messages(
    session: AsyncSession,
@@ -37,6 +45,7 @@ async def index_discord_messages(
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index Discord messages from all accessible channels.
@@ -49,6 +58,8 @@ async def index_discord_messages(
        start_date: Start date for indexing (YYYY-MM-DD format)
        end_date: End date for indexing (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        on_heartbeat_callback: Optional callback to update the notification during long-running indexing.
            Called periodically with (indexed_count) to prevent the task from appearing stuck.

    Returns:
        Tuple containing (number of documents indexed, error message or None)
@@ -281,6 +292,9 @@ async def index_discord_messages(
    documents_skipped = 0
    skipped_channels: list[str] = []

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()

    # Process each guild and channel
    await task_logger.log_task_progress(
        log_entry,
@@ -290,6 +304,14 @@ async def index_discord_messages(
    try:
        for guild in guilds:
            # Check if it's time for a heartbeat update
            if (
                on_heartbeat_callback
                and (time.time() - last_heartbeat_time)
                >= HEARTBEAT_INTERVAL_SECONDS
            ):
                await on_heartbeat_callback(documents_indexed)
                last_heartbeat_time = time.time()

            guild_id = guild["id"]
            guild_name = guild["name"]
            logger.info(f"Processing guild: {guild_name} ({guild_id})")

View file

@@ -4,6 +4,8 @@ Elasticsearch indexer for SurfSense
import json
import logging
import time
from collections.abc import Awaitable, Callable
from datetime import UTC, datetime
from typing import Any
@@ -25,6 +27,12 @@ from .base import (
    get_current_timestamp,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30

logger = logging.getLogger(__name__)
@@ -36,6 +44,7 @@ async def index_elasticsearch_documents(
    start_date: str,
    end_date: str,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index documents from Elasticsearch into SurfSense
@@ -48,6 +57,7 @@ async def index_elasticsearch_documents(
        start_date: Start date for indexing (not used for Elasticsearch, kept for compatibility)
        end_date: End date for indexing (not used for Elasticsearch, kept for compatibility)
        update_last_indexed: Whether to update the last indexed timestamp
        on_heartbeat_callback: Optional callback to update the notification during long-running indexing.

    Returns:
        Tuple of (number of documents processed, error message if any)
@@ -155,6 +165,9 @@ async def index_elasticsearch_documents(
    documents_processed = 0

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()

    try:
        await task_logger.log_task_progress(
            log_entry,
@@ -172,6 +185,15 @@ async def index_elasticsearch_documents(
            size=min(max_documents, 100),  # Scroll in batches
            fields=config.get("ELASTICSEARCH_FIELDS"),
        ):
            # Check if it's time for a heartbeat update
            if (
                on_heartbeat_callback
                and (time.time() - last_heartbeat_time)
                >= HEARTBEAT_INTERVAL_SECONDS
            ):
                await on_heartbeat_callback(documents_processed)
                last_heartbeat_time = time.time()

            if documents_processed >= max_documents:
                break

View file

@@ -5,6 +5,8 @@ This indexer processes entire repository digests in one pass, dramatically
reducing LLM API calls compared to the previous file-by-file approach.
"""

import time
from collections.abc import Awaitable, Callable
from datetime import UTC, datetime

from sqlalchemy.exc import SQLAlchemyError
@@ -30,6 +32,12 @@ from .base import (
    logger,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds - update the notification every 30 seconds
HEARTBEAT_INTERVAL_SECONDS = 30

# Maximum characters for a single digest before splitting
# Most LLMs can handle 128k+ tokens now, but we'll be conservative
MAX_DIGEST_CHARS = 500_000  # ~125k tokens
@@ -43,6 +51,7 @@ async def index_github_repos(
    start_date: str | None = None,  # Ignored - GitHub indexes full repo snapshots
    end_date: str | None = None,  # Ignored - GitHub indexes full repo snapshots
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index GitHub repositories using gitingest for efficient processing.
@@ -62,6 +71,7 @@ async def index_github_repos(
        start_date: Ignored - kept for API compatibility
        end_date: Ignored - kept for API compatibility
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        on_heartbeat_callback: Optional callback to update the notification during long-running indexing.

    Returns:
        Tuple containing (number of documents indexed, error message or None)
@@ -168,7 +178,18 @@ async def index_github_repos(
        f"Starting gitingest indexing for {len(repo_full_names_to_index)} repositories."
    )

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()
    documents_indexed = 0

    for repo_full_name in repo_full_names_to_index:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()

        if not repo_full_name or not isinstance(repo_full_name, str):
            logger.warning(f"Skipping invalid repository entry: {repo_full_name}")
            continue

View file

@@ -2,10 +2,10 @@
Google Calendar connector indexer.
"""

import time
from collections.abc import Awaitable, Callable
from datetime import datetime, timedelta

import pytz
from dateutil.parser import isoparse
from google.oauth2.credentials import Credentials
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
@@ -30,6 +30,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_google_calendar_events(
    session: AsyncSession,
@@ -39,6 +45,7 @@ async def index_google_calendar_events(
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index Google Calendar events.
@@ -52,6 +59,7 @@ async def index_google_calendar_events(
        end_date: End date for indexing (YYYY-MM-DD format). Can be in the future to index upcoming events.
            Defaults to today if not provided.
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        on_heartbeat_callback: Optional callback to update the notification during long-running indexing.

    Returns:
        Tuple containing (number of documents indexed, error message or None)
@@ -281,7 +289,17 @@ async def index_google_calendar_events(
        0  # Track events skipped due to duplicate content_hash
    )

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()

    for event in events:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()

        try:
            event_id = event.get("id")
            event_summary = event.get("summary", "No Title")

View file

@@ -1,6 +1,8 @@
"""Google Drive indexer using Surfsense file processors."""

import logging
import time
from collections.abc import Awaitable, Callable

from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
@@ -24,6 +26,12 @@ from app.tasks.connector_indexers.base import (
)
from app.utils.document_converters import generate_unique_identifier_hash

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30

logger = logging.getLogger(__name__)
@@ -38,6 +46,7 @@ async def index_google_drive_files(
    update_last_indexed: bool = True,
    max_files: int = 500,
    include_subfolders: bool = False,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index Google Drive files for a specific connector.
@@ -53,6 +62,7 @@ async def index_google_drive_files(
        update_last_indexed: Whether to update last_indexed_at timestamp
        max_files: Maximum number of files to index
        include_subfolders: Whether to recursively index files in subfolders
        on_heartbeat_callback: Optional callback to update the notification during long-running indexing.

    Returns:
        Tuple of (number_of_indexed_files, error_message)
@@ -147,6 +157,7 @@ async def index_google_drive_files(
                log_entry=log_entry,
                max_files=max_files,
                include_subfolders=include_subfolders,
                on_heartbeat_callback=on_heartbeat_callback,
            )
        else:
            logger.info(f"Using full scan for connector {connector_id}")
@@ -163,6 +174,7 @@ async def index_google_drive_files(
                log_entry=log_entry,
                max_files=max_files,
                include_subfolders=include_subfolders,
                on_heartbeat_callback=on_heartbeat_callback,
            )

        documents_indexed, documents_skipped = result
@@ -383,6 +395,7 @@ async def _index_full_scan(
    log_entry: any,
    max_files: int,
    include_subfolders: bool = False,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, int]:
    """Perform full scan indexing of a folder."""
    await task_logger.log_task_progress(
@@ -399,10 +412,20 @@ async def _index_full_scan(
    documents_skipped = 0
    files_processed = 0

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()

    # Queue of folders to process: (folder_id, folder_name)
    folders_to_process = [(folder_id, folder_name)]

    while folders_to_process and files_processed < max_files:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()

        current_folder_id, current_folder_name = folders_to_process.pop(0)
        logger.info(f"Processing folder: {current_folder_name} ({current_folder_id})")
        page_token = None
@@ -485,6 +508,7 @@ async def _index_with_delta_sync(
    log_entry: any,
    max_files: int,
    include_subfolders: bool = False,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, int]:
    """Perform delta sync indexing using change tracking.
@@ -515,7 +539,17 @@ async def _index_with_delta_sync(
    documents_skipped = 0
    files_processed = 0

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()

    for change in changes:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()

        if files_processed >= max_files:
            break

View file

@@ -2,6 +2,8 @@
Google Gmail connector indexer.
"""

import time
from collections.abc import Awaitable, Callable
from datetime import datetime

from google.oauth2.credentials import Credentials
@@ -33,6 +35,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_google_gmail_messages(
    session: AsyncSession,
@@ -43,6 +51,7 @@ async def index_google_gmail_messages(
    end_date: str | None = None,
    update_last_indexed: bool = True,
    max_messages: int = 1000,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str]:
    """
    Index Gmail messages for a specific connector.
@@ -56,6 +65,7 @@ async def index_google_gmail_messages(
        end_date: End date for filtering messages (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        max_messages: Maximum number of messages to fetch (default: 1000)
        on_heartbeat_callback: Optional callback to update the notification during long-running indexing.

    Returns:
        Tuple of (number_of_indexed_messages, status_message)
@@ -212,7 +222,18 @@ async def index_google_gmail_messages(
    documents_indexed = 0
    skipped_messages = []
    documents_skipped = 0

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()

    for message in messages:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()

        try:
            # Extract message information
            message_id = message.get("id", "")

View file

@@ -3,6 +3,8 @@ Jira connector indexer.
"""

import contextlib
import time
from collections.abc import Awaitable, Callable
from datetime import datetime

from sqlalchemy.exc import SQLAlchemyError
@@ -30,6 +32,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds - update the notification every 30 seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_jira_issues(
    session: AsyncSession,
@@ -39,6 +47,7 @@ async def index_jira_issues(
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index Jira issues and comments.
@@ -51,6 +60,7 @@ async def index_jira_issues(
        start_date: Start date for indexing (YYYY-MM-DD format)
        end_date: End date for indexing (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        on_heartbeat_callback: Optional callback to update the notification during long-running indexing.

    Returns:
        Tuple containing (number of documents indexed, error message or None)
@@ -169,7 +179,17 @@ async def index_jira_issues(
    skipped_issues = []
    documents_skipped = 0

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()

    for issue in issues:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()

        try:
            issue_id = issue.get("key")
            issue_identifier = issue.get("key", "")

View file

@@ -2,6 +2,8 @@
Linear connector indexer.
"""

import time
from collections.abc import Awaitable, Callable
from datetime import datetime

from sqlalchemy.exc import SQLAlchemyError
@@ -29,6 +31,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds - update the notification every 30 seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_linear_issues(
    session: AsyncSession,
@@ -38,6 +46,7 @@ async def index_linear_issues(
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index Linear issues and comments.
@@ -50,6 +59,7 @@ async def index_linear_issues(
        start_date: Start date for indexing (YYYY-MM-DD format)
        end_date: End date for indexing (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        on_heartbeat_callback: Optional callback to update the notification during long-running indexing.

    Returns:
        Tuple containing (number of documents indexed, error message or None)
@@ -188,6 +198,9 @@ async def index_linear_issues(
    documents_skipped = 0
    skipped_issues = []

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()

    await task_logger.log_task_progress(
        log_entry,
        f"Starting to process {len(issues)} Linear issues",
@@ -196,6 +209,14 @@ async def index_linear_issues(
    # Process each issue
    for issue in issues:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()

        try:
            issue_id = issue.get("id", "")
            issue_identifier = issue.get("identifier", "")

View file

@@ -2,6 +2,8 @@
Luma connector indexer.
"""

import time
from collections.abc import Awaitable, Callable
from datetime import datetime, timedelta

from sqlalchemy.exc import SQLAlchemyError
@@ -28,6 +30,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_luma_events(
    session: AsyncSession,
@@ -37,6 +45,7 @@ async def index_luma_events(
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index Luma events.
@@ -50,6 +59,7 @@ async def index_luma_events(
        end_date: End date for indexing (YYYY-MM-DD format). Can be in the future to index upcoming events.
            Defaults to today if not provided.
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        on_heartbeat_callback: Optional callback to update the notification during long-running indexing.

    Returns:
        Tuple containing (number of documents indexed, error message or None)
@@ -221,7 +231,17 @@ async def index_luma_events(
    documents_skipped = 0
    skipped_events = []

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()

    for event in events:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()

        try:
            # Luma event structure fields - events have a nested 'event' field
            event_data = event.get("event", {})

View file

@@ -2,6 +2,7 @@
Notion connector indexer.
"""

import time
from collections.abc import Awaitable, Callable
from datetime import datetime
@@ -34,6 +35,13 @@ from .base import (
# Signature: async callback(retry_reason, attempt, max_attempts, wait_seconds) -> None
RetryCallbackType = Callable[[str, int, int, float], Awaitable[None]]

# Type alias for heartbeat callback
# Signature: async callback(indexed_count) -> None
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds - update the notification every 30 seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_notion_pages(
    session: AsyncSession,
@@ -44,6 +52,7 @@ async def index_notion_pages(
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_retry_callback: RetryCallbackType | None = None,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index all accessible Notion pages.
@@ -59,6 +68,8 @@ async def index_notion_pages(
        on_retry_callback: Optional callback for retry progress notifications.
            Signature: async callback(retry_reason, attempt, max_attempts, wait_seconds)
            retry_reason is one of: 'rate_limit', 'server_error', 'timeout'
        on_heartbeat_callback: Optional callback to update the notification during long-running indexing.
            Called periodically with (indexed_count) to prevent the task from appearing stuck.

    Returns:
        Tuple containing (number of documents indexed, error message or None)
@@ -211,6 +222,9 @@ async def index_notion_pages(
    documents_skipped = 0
    skipped_pages = []

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()

    await task_logger.log_task_progress(
        log_entry,
        f"Starting to process {len(pages)} Notion pages",
@@ -219,6 +233,14 @@ async def index_notion_pages(
    # Process each page
    for page in pages:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()

        try:
            page_id = page.get("page_id")
            page_title = page.get("title", f"Untitled page ({page_id})")
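Notion is the one indexer that now takes two callbacks. A minimal sketch of a caller wiring both (the callback bodies are assumptions; only the signatures come from the type aliases above):

# Hypothetical wiring - not part of this commit.
import logging

logger = logging.getLogger(__name__)

async def on_retry(reason: str, attempt: int, max_attempts: int, wait_seconds: float) -> None:
    # Matches RetryCallbackType: async callback(retry_reason, attempt, max_attempts, wait_seconds)
    logger.info(f"Notion retry ({reason}): attempt {attempt}/{max_attempts}, waiting {wait_seconds:.1f}s")

async def on_heartbeat(indexed_count: int) -> None:
    # Matches HeartbeatCallbackType: async callback(indexed_count)
    logger.info(f"Notion indexing alive: {indexed_count} pages so far")

# documents_indexed, error = await index_notion_pages(
#     session, ..., on_retry_callback=on_retry, on_heartbeat_callback=on_heartbeat
# )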

View file

@@ -7,6 +7,8 @@ This connector is only available in self-hosted mode.
import os
import re
import time
from collections.abc import Awaitable, Callable
from datetime import UTC, datetime
from pathlib import Path
@@ -35,6 +37,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30


def parse_frontmatter(content: str) -> tuple[dict | None, str]:
    """
@@ -152,6 +160,7 @@ async def index_obsidian_vault(
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index notes from a local Obsidian vault.
@@ -167,6 +176,7 @@ async def index_obsidian_vault(
        start_date: Start date for filtering (YYYY-MM-DD format) - optional
        end_date: End date for filtering (YYYY-MM-DD format) - optional
        update_last_indexed: Whether to update the last_indexed_at timestamp
        on_heartbeat_callback: Optional callback to update the notification during long-running indexing.

    Returns:
        Tuple containing (number of documents indexed, error message or None)
@@ -305,7 +315,17 @@ async def index_obsidian_vault(
    indexed_count = 0
    skipped_count = 0

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()

    for file_info in files:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(indexed_count)
            last_heartbeat_time = time.time()

        try:
            file_path = file_info["path"]
            relative_path = file_info["relative_path"]

View file

@@ -2,6 +2,8 @@
Slack connector indexer.
"""

import time
from collections.abc import Awaitable, Callable
from datetime import datetime

from slack_sdk.errors import SlackApiError
@@ -29,6 +31,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds - update the notification every 30 seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_slack_messages(
    session: AsyncSession,
@@ -38,6 +46,7 @@ async def index_slack_messages(
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index Slack messages from all accessible channels.
@@ -50,6 +59,8 @@ async def index_slack_messages(
        start_date: Start date for indexing (YYYY-MM-DD format)
        end_date: End date for indexing (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        on_heartbeat_callback: Optional callback to update the notification during long-running indexing.
            Called periodically with (indexed_count) to prevent the task from appearing stuck.

    Returns:
        Tuple containing (number of documents indexed, error message or None)
@@ -164,6 +175,9 @@ async def index_slack_messages(
    documents_skipped = 0
    skipped_channels = []

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()

    await task_logger.log_task_progress(
        log_entry,
        f"Starting to process {len(channels)} Slack channels",
@@ -172,6 +186,13 @@ async def index_slack_messages(
    # Process each channel
    for channel_obj in channels:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()

        channel_id = channel_obj["id"]
        channel_name = channel_obj["name"]
        is_private = channel_obj["is_private"]

View file

@@ -2,6 +2,8 @@
Microsoft Teams connector indexer.
"""

import time
from collections.abc import Awaitable, Callable
from datetime import UTC

from sqlalchemy.exc import SQLAlchemyError
@@ -28,6 +30,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds - update the notification every 30 seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_teams_messages(
    session: AsyncSession,
@@ -37,6 +45,7 @@ async def index_teams_messages(
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index Microsoft Teams messages from all accessible teams and channels.
@@ -49,6 +58,8 @@ async def index_teams_messages(
        start_date: Start date for indexing (YYYY-MM-DD format)
        end_date: End date for indexing (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        on_heartbeat_callback: Optional callback to update the notification during long-running indexing.
            Called periodically with (indexed_count) to prevent the task from appearing stuck.

    Returns:
        Tuple containing (number of documents indexed, error message or None)
@@ -161,6 +172,9 @@ async def index_teams_messages(
    documents_skipped = 0
    skipped_channels = []

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()

    await task_logger.log_task_progress(
        log_entry,
        f"Starting to process {len(teams)} Teams",
@@ -185,6 +199,14 @@ async def index_teams_messages(
    # Process each team
    for team in teams:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time)
            >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()

        team_id = team.get("id")
        team_name = team.get("displayName", "Unknown Team")

View file

@@ -2,6 +2,8 @@
Webcrawler connector indexer.
"""

import time
from collections.abc import Awaitable, Callable
from datetime import datetime

from sqlalchemy.exc import SQLAlchemyError
@@ -29,6 +31,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_crawled_urls(
    session: AsyncSession,
@@ -38,6 +46,7 @@ async def index_crawled_urls(
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index web page URLs.
@@ -50,6 +59,7 @@ async def index_crawled_urls(
        start_date: Start date for filtering (YYYY-MM-DD format) - optional
        end_date: End date for filtering (YYYY-MM-DD format) - optional
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        on_heartbeat_callback: Optional callback to update the notification during long-running indexing.

    Returns:
        Tuple containing (number of documents indexed, error message or None)
@@ -140,7 +150,17 @@ async def index_crawled_urls(
    documents_skipped = 0
    failed_urls = []

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()

    for idx, url in enumerate(urls, 1):
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()

        try:
            logger.info(f"Processing URL {idx}/{len(urls)}: {url}")