feat: add heartbeat callback support for long-running indexing tasks and implement stale notification cleanup task

This commit is contained in:
Anish Sarkar 2026-02-01 02:17:06 +05:30
parent e5f7e87f42
commit 024a683b4f
27 changed files with 685 additions and 7 deletions

View file

@ -0,0 +1,141 @@
"""Celery task to detect and mark stale connector indexing notifications as failed.
This task runs periodically (every 5 minutes by default) to find notifications
that are stuck in "in_progress" status but haven't received a heartbeat update
in the configured timeout period. These are marked as "failed" to prevent the
frontend from showing a perpetual "syncing" state.
"""
import logging
from datetime import UTC, datetime, timedelta
from sqlalchemy import and_, update
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
from sqlalchemy.future import select
from sqlalchemy.orm.attributes import flag_modified
from sqlalchemy.pool import NullPool
from app.celery_app import celery_app
from app.config import config
from app.db import Notification
logger = logging.getLogger(__name__)
# Timeout in minutes - notifications without heartbeat for this long are marked as failed
# Should be longer than HEARTBEAT_INTERVAL_SECONDS (30s) * a reasonable number of missed heartbeats
# 5 minutes = 10 missed heartbeats, which is a reasonable threshold
STALE_NOTIFICATION_TIMEOUT_MINUTES = 5
def get_celery_session_maker():
"""Create async session maker for Celery tasks."""
engine = create_async_engine(
config.DATABASE_URL,
poolclass=NullPool,
echo=False,
)
return async_sessionmaker(engine, expire_on_commit=False)
@celery_app.task(name="cleanup_stale_indexing_notifications")
def cleanup_stale_indexing_notifications_task():
"""
Check for stale connector indexing notifications and mark them as failed.
This task finds notifications that:
- Have type = 'connector_indexing'
- Have metadata.status = 'in_progress'
- Have updated_at older than STALE_NOTIFICATION_TIMEOUT_MINUTES
And marks them as failed with an appropriate error message.
"""
import asyncio
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
loop.run_until_complete(_cleanup_stale_notifications())
finally:
loop.close()
async def _cleanup_stale_notifications():
"""Find and mark stale connector indexing notifications as failed."""
async with get_celery_session_maker()() as session:
try:
# Calculate the cutoff time
cutoff_time = datetime.now(UTC) - timedelta(
minutes=STALE_NOTIFICATION_TIMEOUT_MINUTES
)
# Find stale notifications:
# - type = 'connector_indexing'
# - metadata->>'status' = 'in_progress'
# - updated_at < cutoff_time
result = await session.execute(
select(Notification).filter(
and_(
Notification.type == "connector_indexing",
Notification.notification_metadata["status"].astext == "in_progress",
Notification.updated_at < cutoff_time,
)
)
)
stale_notifications = result.scalars().all()
if not stale_notifications:
logger.debug("No stale connector indexing notifications found")
return
logger.warning(
f"Found {len(stale_notifications)} stale connector indexing notifications "
f"(no heartbeat for >{STALE_NOTIFICATION_TIMEOUT_MINUTES} minutes)"
)
# Mark each stale notification as failed
for notification in stale_notifications:
try:
# Get current indexed count from metadata if available
indexed_count = notification.notification_metadata.get("indexed_count", 0)
connector_name = notification.notification_metadata.get("connector_name", "Unknown")
# Calculate how long it's been stale
stale_duration = datetime.now(UTC) - notification.updated_at
stale_minutes = int(stale_duration.total_seconds() / 60)
# Update notification metadata
notification.notification_metadata["status"] = "failed"
notification.notification_metadata["completed_at"] = datetime.now(UTC).isoformat()
notification.notification_metadata["error_message"] = (
f"Indexing task appears to have crashed or timed out. "
f"No activity detected for {stale_minutes} minutes. "
f"Please try syncing again."
)
# Flag the JSONB column as modified for SQLAlchemy to detect the change
flag_modified(notification, "notification_metadata")
logger.info(
f"Marking notification {notification.id} for connector '{connector_name}' as failed "
f"(stale for {stale_minutes} minutes, indexed {indexed_count} items before failure)"
)
except Exception as e:
logger.error(
f"Error marking notification {notification.id} as failed: {e!s}",
exc_info=True,
)
continue
# Commit all changes
await session.commit()
logger.info(
f"Successfully marked {len(stale_notifications)} stale notifications as failed"
)
except Exception as e:
logger.error(f"Error cleaning up stale notifications: {e!s}", exc_info=True)
await session.rollback()

View file

@ -9,8 +9,12 @@ to avoid circular import issues with the connector_indexers package.
"""
import logging
from collections.abc import Awaitable, Callable
from importlib import import_module
# Type alias for heartbeat callback function
HeartbeatCallbackType = Callable[[int], Awaitable[None]]
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
@ -86,6 +90,7 @@ async def index_composio_connector(
end_date: str | None = None,
update_last_indexed: bool = True,
max_items: int = 1000,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, int, str | None]:
"""
Index content from a Composio connector.
@ -102,6 +107,7 @@ async def index_composio_connector(
end_date: End date for filtering (YYYY-MM-DD format)
update_last_indexed: Whether to update the last_indexed_at timestamp
max_items: Maximum number of items to fetch
on_heartbeat_callback: Optional callback to report progress for heartbeat updates
Returns:
Tuple of (number_of_indexed_items, number_of_skipped_items, error_message or None)
@ -180,6 +186,7 @@ async def index_composio_connector(
"log_entry": log_entry,
"update_last_indexed": update_last_indexed,
"max_items": max_items,
"on_heartbeat_callback": on_heartbeat_callback,
}
# Add date params for toolkits that support them

View file

@ -2,6 +2,9 @@
Airtable connector indexer.
"""
import time
from collections.abc import Awaitable, Callable
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
@ -17,6 +20,12 @@ from app.utils.document_converters import (
generate_unique_identifier_hash,
)
# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]
# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30
from .base import (
calculate_date_range,
check_document_by_unique_identifier,
@ -37,6 +46,7 @@ async def index_airtable_records(
end_date: str | None = None,
max_records: int = 2500,
update_last_indexed: bool = True,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
"""
Index Airtable records for a given connector.
@ -50,6 +60,7 @@ async def index_airtable_records(
end_date: End date for filtering records (YYYY-MM-DD)
max_records: Maximum number of records to fetch per table
update_last_indexed: Whether to update the last_indexed_at timestamp
on_heartbeat_callback: Optional callback to update notification during long-running indexing.
Returns:
Tuple of (number_of_documents_processed, error_message)
@ -127,8 +138,16 @@ async def index_airtable_records(
logger.info(f"Found {len(bases)} Airtable bases to process")
# Heartbeat tracking - update notification periodically to prevent appearing stuck
last_heartbeat_time = time.time()
total_documents_indexed = 0
# Process each base
for base in bases:
# Check if it's time for a heartbeat update
if on_heartbeat_callback and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(total_documents_indexed)
last_heartbeat_time = time.time()
base_id = base.get("id")
base_name = base.get("name", "Unknown Base")
@ -204,6 +223,11 @@ async def index_airtable_records(
documents_skipped = 0
# Process each record
for record in records:
# Check if it's time for a heartbeat update
if on_heartbeat_callback and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(total_documents_indexed)
last_heartbeat_time = time.time()
try:
# Generate markdown content
markdown_content = (

View file

@ -2,6 +2,8 @@
BookStack connector indexer.
"""
import time
from collections.abc import Awaitable, Callable
from datetime import datetime
from sqlalchemy.exc import SQLAlchemyError
@ -19,6 +21,12 @@ from app.utils.document_converters import (
generate_unique_identifier_hash,
)
# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]
# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30
from .base import (
calculate_date_range,
check_document_by_unique_identifier,
@ -38,6 +46,7 @@ async def index_bookstack_pages(
start_date: str | None = None,
end_date: str | None = None,
update_last_indexed: bool = True,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
"""
Index BookStack pages.
@ -50,6 +59,7 @@ async def index_bookstack_pages(
start_date: Start date for indexing (YYYY-MM-DD format)
end_date: End date for indexing (YYYY-MM-DD format)
update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
on_heartbeat_callback: Optional callback to update notification during long-running indexing.
Returns:
Tuple containing (number of documents indexed, error message or None)
@ -179,7 +189,14 @@ async def index_bookstack_pages(
skipped_pages = []
documents_skipped = 0
# Heartbeat tracking - update notification periodically to prevent appearing stuck
last_heartbeat_time = time.time()
for page in pages:
# Check if it's time for a heartbeat update
if on_heartbeat_callback and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = time.time()
try:
page_id = page.get("id")
page_name = page.get("name", "")

View file

@ -3,6 +3,8 @@ ClickUp connector indexer.
"""
import contextlib
import time
from collections.abc import Awaitable, Callable
from datetime import datetime
from sqlalchemy.exc import SQLAlchemyError
@ -20,6 +22,12 @@ from app.utils.document_converters import (
generate_unique_identifier_hash,
)
# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]
# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30
from .base import (
check_document_by_unique_identifier,
check_duplicate_document_by_hash,
@ -38,6 +46,7 @@ async def index_clickup_tasks(
start_date: str | None = None,
end_date: str | None = None,
update_last_indexed: bool = True,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
"""
Index tasks from ClickUp workspace.
@ -50,6 +59,7 @@ async def index_clickup_tasks(
start_date: Start date for filtering tasks (YYYY-MM-DD format)
end_date: End date for filtering tasks (YYYY-MM-DD format)
update_last_indexed: Whether to update the last_indexed_at timestamp
on_heartbeat_callback: Optional callback to update notification during long-running indexing.
Returns:
Tuple of (number of indexed tasks, error message if any)
@ -132,6 +142,9 @@ async def index_clickup_tasks(
documents_indexed = 0
documents_skipped = 0
# Heartbeat tracking - update notification periodically to prevent appearing stuck
last_heartbeat_time = time.time()
# Iterate workspaces and fetch tasks
for workspace in workspaces:
workspace_id = workspace.get("id")
@ -170,6 +183,11 @@ async def index_clickup_tasks(
)
for task in tasks:
# Check if it's time for a heartbeat update
if on_heartbeat_callback and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = time.time()
try:
task_id = task.get("id")
task_name = task.get("name", "Untitled Task")

View file

@ -3,6 +3,8 @@ Confluence connector indexer.
"""
import contextlib
import time
from collections.abc import Awaitable, Callable
from datetime import datetime
from sqlalchemy.exc import SQLAlchemyError
@ -20,6 +22,12 @@ from app.utils.document_converters import (
generate_unique_identifier_hash,
)
# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]
# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30
from .base import (
calculate_date_range,
check_document_by_unique_identifier,
@ -39,6 +47,7 @@ async def index_confluence_pages(
start_date: str | None = None,
end_date: str | None = None,
update_last_indexed: bool = True,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
"""
Index Confluence pages and comments.
@ -51,6 +60,7 @@ async def index_confluence_pages(
start_date: Start date for indexing (YYYY-MM-DD format)
end_date: End date for indexing (YYYY-MM-DD format)
update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
on_heartbeat_callback: Optional callback to update notification during long-running indexing.
Returns:
Tuple containing (number of documents indexed, error message or None)
@ -175,7 +185,14 @@ async def index_confluence_pages(
skipped_pages = []
documents_skipped = 0
# Heartbeat tracking - update notification periodically to prevent appearing stuck
last_heartbeat_time = time.time()
for page in pages:
# Check if it's time for a heartbeat update
if on_heartbeat_callback and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = time.time()
try:
page_id = page.get("id")
page_title = page.get("title", "")

View file

@ -3,6 +3,8 @@ Discord connector indexer.
"""
import asyncio
import time
from collections.abc import Awaitable, Callable
from datetime import UTC, datetime, timedelta
from sqlalchemy.exc import SQLAlchemyError
@ -28,6 +30,12 @@ from .base import (
update_connector_last_indexed,
)
# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]
# Heartbeat interval in seconds - update notification every 30 seconds
HEARTBEAT_INTERVAL_SECONDS = 30
async def index_discord_messages(
session: AsyncSession,
@ -37,6 +45,7 @@ async def index_discord_messages(
start_date: str | None = None,
end_date: str | None = None,
update_last_indexed: bool = True,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
"""
Index Discord messages from all accessible channels.
@ -49,6 +58,8 @@ async def index_discord_messages(
start_date: Start date for indexing (YYYY-MM-DD format)
end_date: End date for indexing (YYYY-MM-DD format)
update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
on_heartbeat_callback: Optional callback to update notification during long-running indexing.
Called periodically with (indexed_count) to prevent task appearing stuck.
Returns:
Tuple containing (number of documents indexed, error message or None)
@ -281,6 +292,9 @@ async def index_discord_messages(
documents_skipped = 0
skipped_channels: list[str] = []
# Heartbeat tracking - update notification periodically to prevent appearing stuck
last_heartbeat_time = time.time()
# Process each guild and channel
await task_logger.log_task_progress(
log_entry,
@ -290,6 +304,10 @@ async def index_discord_messages(
try:
for guild in guilds:
# Check if it's time for a heartbeat update
if on_heartbeat_callback and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = time.time()
guild_id = guild["id"]
guild_name = guild["name"]
logger.info(f"Processing guild: {guild_name} ({guild_id})")

View file

@ -4,6 +4,8 @@ Elasticsearch indexer for SurfSense
import json
import logging
import time
from collections.abc import Awaitable, Callable
from datetime import UTC, datetime
from typing import Any
@ -19,6 +21,12 @@ from app.utils.document_converters import (
generate_unique_identifier_hash,
)
# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]
# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30
from .base import (
check_document_by_unique_identifier,
check_duplicate_document_by_hash,
@ -36,6 +44,7 @@ async def index_elasticsearch_documents(
start_date: str,
end_date: str,
update_last_indexed: bool = True,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
"""
Index documents from Elasticsearch into SurfSense
@ -48,6 +57,7 @@ async def index_elasticsearch_documents(
start_date: Start date for indexing (not used for Elasticsearch, kept for compatibility)
end_date: End date for indexing (not used for Elasticsearch, kept for compatibility)
update_last_indexed: Whether to update the last indexed timestamp
on_heartbeat_callback: Optional callback to update notification during long-running indexing.
Returns:
Tuple of (number of documents processed, error message if any)
@ -155,6 +165,9 @@ async def index_elasticsearch_documents(
documents_processed = 0
# Heartbeat tracking - update notification periodically to prevent appearing stuck
last_heartbeat_time = time.time()
try:
await task_logger.log_task_progress(
log_entry,
@ -172,6 +185,11 @@ async def index_elasticsearch_documents(
size=min(max_documents, 100), # Scroll in batches
fields=config.get("ELASTICSEARCH_FIELDS"),
):
# Check if it's time for a heartbeat update
if on_heartbeat_callback and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_processed)
last_heartbeat_time = time.time()
if documents_processed >= max_documents:
break

View file

@ -5,6 +5,8 @@ This indexer processes entire repository digests in one pass, dramatically
reducing LLM API calls compared to the previous file-by-file approach.
"""
import time
from collections.abc import Awaitable, Callable
from datetime import UTC, datetime
from sqlalchemy.exc import SQLAlchemyError
@ -22,6 +24,12 @@ from app.utils.document_converters import (
generate_unique_identifier_hash,
)
# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]
# Heartbeat interval in seconds - update notification every 30 seconds
HEARTBEAT_INTERVAL_SECONDS = 30
from .base import (
check_document_by_unique_identifier,
check_duplicate_document_by_hash,
@ -43,6 +51,7 @@ async def index_github_repos(
start_date: str | None = None, # Ignored - GitHub indexes full repo snapshots
end_date: str | None = None, # Ignored - GitHub indexes full repo snapshots
update_last_indexed: bool = True,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
"""
Index GitHub repositories using gitingest for efficient processing.
@ -62,6 +71,7 @@ async def index_github_repos(
start_date: Ignored - kept for API compatibility
end_date: Ignored - kept for API compatibility
update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
on_heartbeat_callback: Optional callback to update notification during long-running indexing.
Returns:
Tuple containing (number of documents indexed, error message or None)
@ -168,7 +178,15 @@ async def index_github_repos(
f"Starting gitingest indexing for {len(repo_full_names_to_index)} repositories."
)
# Heartbeat tracking - update notification periodically to prevent appearing stuck
last_heartbeat_time = time.time()
documents_indexed = 0
for repo_full_name in repo_full_names_to_index:
# Check if it's time for a heartbeat update
if on_heartbeat_callback and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = time.time()
if not repo_full_name or not isinstance(repo_full_name, str):
logger.warning(f"Skipping invalid repository entry: {repo_full_name}")
continue

View file

@ -2,6 +2,8 @@
Google Calendar connector indexer.
"""
import time
from collections.abc import Awaitable, Callable
from datetime import datetime, timedelta
import pytz
@ -21,6 +23,12 @@ from app.utils.document_converters import (
generate_unique_identifier_hash,
)
# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]
# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30
from .base import (
check_document_by_unique_identifier,
check_duplicate_document_by_hash,
@ -39,6 +47,7 @@ async def index_google_calendar_events(
start_date: str | None = None,
end_date: str | None = None,
update_last_indexed: bool = True,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
"""
Index Google Calendar events.
@ -52,6 +61,7 @@ async def index_google_calendar_events(
end_date: End date for indexing (YYYY-MM-DD format). Can be in the future to index upcoming events.
Defaults to today if not provided.
update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
on_heartbeat_callback: Optional callback to update notification during long-running indexing.
Returns:
Tuple containing (number of documents indexed, error message or None)
@ -281,7 +291,14 @@ async def index_google_calendar_events(
0 # Track events skipped due to duplicate content_hash
)
# Heartbeat tracking - update notification periodically to prevent appearing stuck
last_heartbeat_time = time.time()
for event in events:
# Check if it's time for a heartbeat update
if on_heartbeat_callback and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = time.time()
try:
event_id = event.get("id")
event_summary = event.get("summary", "No Title")

View file

@ -1,6 +1,8 @@
"""Google Drive indexer using Surfsense file processors."""
import logging
import time
from collections.abc import Awaitable, Callable
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
@ -24,6 +26,12 @@ from app.tasks.connector_indexers.base import (
)
from app.utils.document_converters import generate_unique_identifier_hash
# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]
# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30
logger = logging.getLogger(__name__)
@ -38,6 +46,7 @@ async def index_google_drive_files(
update_last_indexed: bool = True,
max_files: int = 500,
include_subfolders: bool = False,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
"""
Index Google Drive files for a specific connector.
@ -53,6 +62,7 @@ async def index_google_drive_files(
update_last_indexed: Whether to update last_indexed_at timestamp
max_files: Maximum number of files to index
include_subfolders: Whether to recursively index files in subfolders
on_heartbeat_callback: Optional callback to update notification during long-running indexing.
Returns:
Tuple of (number_of_indexed_files, error_message)
@ -147,6 +157,7 @@ async def index_google_drive_files(
log_entry=log_entry,
max_files=max_files,
include_subfolders=include_subfolders,
on_heartbeat_callback=on_heartbeat_callback,
)
else:
logger.info(f"Using full scan for connector {connector_id}")
@ -163,6 +174,7 @@ async def index_google_drive_files(
log_entry=log_entry,
max_files=max_files,
include_subfolders=include_subfolders,
on_heartbeat_callback=on_heartbeat_callback,
)
documents_indexed, documents_skipped = result
@ -383,6 +395,7 @@ async def _index_full_scan(
log_entry: any,
max_files: int,
include_subfolders: bool = False,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, int]:
"""Perform full scan indexing of a folder."""
await task_logger.log_task_progress(
@ -399,10 +412,17 @@ async def _index_full_scan(
documents_skipped = 0
files_processed = 0
# Heartbeat tracking - update notification periodically to prevent appearing stuck
last_heartbeat_time = time.time()
# Queue of folders to process: (folder_id, folder_name)
folders_to_process = [(folder_id, folder_name)]
while folders_to_process and files_processed < max_files:
# Check if it's time for a heartbeat update
if on_heartbeat_callback and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = time.time()
current_folder_id, current_folder_name = folders_to_process.pop(0)
logger.info(f"Processing folder: {current_folder_name} ({current_folder_id})")
page_token = None
@ -485,6 +505,7 @@ async def _index_with_delta_sync(
log_entry: any,
max_files: int,
include_subfolders: bool = False,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, int]:
"""Perform delta sync indexing using change tracking.
@ -515,7 +536,14 @@ async def _index_with_delta_sync(
documents_skipped = 0
files_processed = 0
# Heartbeat tracking - update notification periodically to prevent appearing stuck
last_heartbeat_time = time.time()
for change in changes:
# Check if it's time for a heartbeat update
if on_heartbeat_callback and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = time.time()
if files_processed >= max_files:
break

View file

@ -2,6 +2,8 @@
Google Gmail connector indexer.
"""
import time
from collections.abc import Awaitable, Callable
from datetime import datetime
from google.oauth2.credentials import Credentials
@ -23,6 +25,12 @@ from app.utils.document_converters import (
generate_unique_identifier_hash,
)
# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]
# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30
from .base import (
calculate_date_range,
check_document_by_unique_identifier,
@ -43,6 +51,7 @@ async def index_google_gmail_messages(
end_date: str | None = None,
update_last_indexed: bool = True,
max_messages: int = 1000,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str]:
"""
Index Gmail messages for a specific connector.
@ -56,6 +65,7 @@ async def index_google_gmail_messages(
end_date: End date for filtering messages (YYYY-MM-DD format)
update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
max_messages: Maximum number of messages to fetch (default: 100)
on_heartbeat_callback: Optional callback to update notification during long-running indexing.
Returns:
Tuple of (number_of_indexed_messages, status_message)
@ -212,7 +222,15 @@ async def index_google_gmail_messages(
documents_indexed = 0
skipped_messages = []
documents_skipped = 0
# Heartbeat tracking - update notification periodically to prevent appearing stuck
last_heartbeat_time = time.time()
for message in messages:
# Check if it's time for a heartbeat update
if on_heartbeat_callback and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = time.time()
try:
# Extract message information
message_id = message.get("id", "")

View file

@ -3,6 +3,8 @@ Jira connector indexer.
"""
import contextlib
import time
from collections.abc import Awaitable, Callable
from datetime import datetime
from sqlalchemy.exc import SQLAlchemyError
@ -20,6 +22,12 @@ from app.utils.document_converters import (
generate_unique_identifier_hash,
)
# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]
# Heartbeat interval in seconds - update notification every 30 seconds
HEARTBEAT_INTERVAL_SECONDS = 30
from .base import (
calculate_date_range,
check_document_by_unique_identifier,
@ -39,6 +47,7 @@ async def index_jira_issues(
start_date: str | None = None,
end_date: str | None = None,
update_last_indexed: bool = True,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
"""
Index Jira issues and comments.
@ -51,6 +60,7 @@ async def index_jira_issues(
start_date: Start date for indexing (YYYY-MM-DD format)
end_date: End date for indexing (YYYY-MM-DD format)
update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
on_heartbeat_callback: Optional callback to update notification during long-running indexing.
Returns:
Tuple containing (number of documents indexed, error message or None)
@ -169,7 +179,14 @@ async def index_jira_issues(
skipped_issues = []
documents_skipped = 0
# Heartbeat tracking - update notification periodically to prevent appearing stuck
last_heartbeat_time = time.time()
for issue in issues:
# Check if it's time for a heartbeat update
if on_heartbeat_callback and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = time.time()
try:
issue_id = issue.get("key")
issue_identifier = issue.get("key", "")

View file

@ -2,6 +2,8 @@
Linear connector indexer.
"""
import time
from collections.abc import Awaitable, Callable
from datetime import datetime
from sqlalchemy.exc import SQLAlchemyError
@ -19,6 +21,12 @@ from app.utils.document_converters import (
generate_unique_identifier_hash,
)
# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]
# Heartbeat interval in seconds - update notification every 30 seconds
HEARTBEAT_INTERVAL_SECONDS = 30
from .base import (
calculate_date_range,
check_document_by_unique_identifier,
@ -38,6 +46,7 @@ async def index_linear_issues(
start_date: str | None = None,
end_date: str | None = None,
update_last_indexed: bool = True,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
"""
Index Linear issues and comments.
@ -50,6 +59,7 @@ async def index_linear_issues(
start_date: Start date for indexing (YYYY-MM-DD format)
end_date: End date for indexing (YYYY-MM-DD format)
update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
on_heartbeat_callback: Optional callback to update notification during long-running indexing.
Returns:
Tuple containing (number of documents indexed, error message or None)
@ -188,6 +198,9 @@ async def index_linear_issues(
documents_skipped = 0
skipped_issues = []
# Heartbeat tracking - update notification periodically to prevent appearing stuck
last_heartbeat_time = time.time()
await task_logger.log_task_progress(
log_entry,
f"Starting to process {len(issues)} Linear issues",
@ -196,6 +209,11 @@ async def index_linear_issues(
# Process each issue
for issue in issues:
# Check if it's time for a heartbeat update
if on_heartbeat_callback and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = time.time()
try:
issue_id = issue.get("id", "")
issue_identifier = issue.get("identifier", "")

View file

@ -2,6 +2,8 @@
Luma connector indexer.
"""
import time
from collections.abc import Awaitable, Callable
from datetime import datetime, timedelta
from sqlalchemy.exc import SQLAlchemyError
@ -19,6 +21,12 @@ from app.utils.document_converters import (
generate_unique_identifier_hash,
)
# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]
# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30
from .base import (
check_document_by_unique_identifier,
check_duplicate_document_by_hash,
@ -37,6 +45,7 @@ async def index_luma_events(
start_date: str | None = None,
end_date: str | None = None,
update_last_indexed: bool = True,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
"""
Index Luma events.
@ -50,6 +59,7 @@ async def index_luma_events(
end_date: End date for indexing (YYYY-MM-DD format). Can be in the future to index upcoming events.
Defaults to today if not provided.
update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
on_heartbeat_callback: Optional callback to update notification during long-running indexing.
Returns:
Tuple containing (number of documents indexed, error message or None)
@ -221,7 +231,14 @@ async def index_luma_events(
documents_skipped = 0
skipped_events = []
# Heartbeat tracking - update notification periodically to prevent appearing stuck
last_heartbeat_time = time.time()
for event in events:
# Check if it's time for a heartbeat update
if on_heartbeat_callback and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = time.time()
try:
# Luma event structure fields - events have nested 'event' field
event_data = event.get("event", {})

View file

@ -2,6 +2,7 @@
Notion connector indexer.
"""
import time
from collections.abc import Awaitable, Callable
from datetime import datetime
@ -34,6 +35,13 @@ from .base import (
# Signature: async callback(retry_reason, attempt, max_attempts, wait_seconds) -> None
RetryCallbackType = Callable[[str, int, int, float], Awaitable[None]]
# Type alias for heartbeat callback
# Signature: async callback(indexed_count) -> None
HeartbeatCallbackType = Callable[[int], Awaitable[None]]
# Heartbeat interval in seconds - update notification every 30 seconds
HEARTBEAT_INTERVAL_SECONDS = 30
async def index_notion_pages(
session: AsyncSession,
@ -44,6 +52,7 @@ async def index_notion_pages(
end_date: str | None = None,
update_last_indexed: bool = True,
on_retry_callback: RetryCallbackType | None = None,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
"""
Index Notion pages from all accessible pages.
@ -59,6 +68,8 @@ async def index_notion_pages(
on_retry_callback: Optional callback for retry progress notifications.
Signature: async callback(retry_reason, attempt, max_attempts, wait_seconds)
retry_reason is one of: 'rate_limit', 'server_error', 'timeout'
on_heartbeat_callback: Optional callback to update notification during long-running indexing.
Called periodically with (indexed_count) to prevent task appearing stuck.
Returns:
Tuple containing (number of documents indexed, error message or None)
@ -211,6 +222,9 @@ async def index_notion_pages(
documents_skipped = 0
skipped_pages = []
# Heartbeat tracking - update notification periodically to prevent appearing stuck
last_heartbeat_time = time.time()
await task_logger.log_task_progress(
log_entry,
f"Starting to process {len(pages)} Notion pages",
@ -219,6 +233,11 @@ async def index_notion_pages(
# Process each page
for page in pages:
# Check if it's time for a heartbeat update
if on_heartbeat_callback and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = time.time()
try:
page_id = page.get("page_id")
page_title = page.get("title", f"Untitled page ({page_id})")

View file

@ -7,6 +7,8 @@ This connector is only available in self-hosted mode.
import os
import re
import time
from collections.abc import Awaitable, Callable
from datetime import UTC, datetime
from pathlib import Path
@ -25,6 +27,12 @@ from app.utils.document_converters import (
generate_unique_identifier_hash,
)
# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]
# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30
from .base import (
build_document_metadata_string,
check_document_by_unique_identifier,
@ -152,6 +160,7 @@ async def index_obsidian_vault(
start_date: str | None = None,
end_date: str | None = None,
update_last_indexed: bool = True,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
"""
Index notes from a local Obsidian vault.
@ -167,6 +176,7 @@ async def index_obsidian_vault(
start_date: Start date for filtering (YYYY-MM-DD format) - optional
end_date: End date for filtering (YYYY-MM-DD format) - optional
update_last_indexed: Whether to update the last_indexed_at timestamp
on_heartbeat_callback: Optional callback to update notification during long-running indexing.
Returns:
Tuple containing (number of documents indexed, error message or None)
@ -305,7 +315,14 @@ async def index_obsidian_vault(
indexed_count = 0
skipped_count = 0
# Heartbeat tracking - update notification periodically to prevent appearing stuck
last_heartbeat_time = time.time()
for file_info in files:
# Check if it's time for a heartbeat update
if on_heartbeat_callback and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(indexed_count)
last_heartbeat_time = time.time()
try:
file_path = file_info["path"]
relative_path = file_info["relative_path"]

View file

@ -2,6 +2,8 @@
Slack connector indexer.
"""
import time
from collections.abc import Awaitable, Callable
from datetime import datetime
from slack_sdk.errors import SlackApiError
@ -18,6 +20,12 @@ from app.utils.document_converters import (
generate_unique_identifier_hash,
)
# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]
# Heartbeat interval in seconds - update notification every 30 seconds
HEARTBEAT_INTERVAL_SECONDS = 30
from .base import (
build_document_metadata_markdown,
calculate_date_range,
@ -38,6 +46,7 @@ async def index_slack_messages(
start_date: str | None = None,
end_date: str | None = None,
update_last_indexed: bool = True,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
"""
Index Slack messages from all accessible channels.
@ -50,6 +59,8 @@ async def index_slack_messages(
start_date: Start date for indexing (YYYY-MM-DD format)
end_date: End date for indexing (YYYY-MM-DD format)
update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
on_heartbeat_callback: Optional callback to update notification during long-running indexing.
Called periodically with (indexed_count) to prevent task appearing stuck.
Returns:
Tuple containing (number of documents indexed, error message or None)
@ -164,6 +175,9 @@ async def index_slack_messages(
documents_skipped = 0
skipped_channels = []
# Heartbeat tracking - update notification periodically to prevent appearing stuck
last_heartbeat_time = time.time()
await task_logger.log_task_progress(
log_entry,
f"Starting to process {len(channels)} Slack channels",
@ -172,6 +186,10 @@ async def index_slack_messages(
# Process each channel
for channel_obj in channels:
# Check if it's time for a heartbeat update
if on_heartbeat_callback and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = time.time()
channel_id = channel_obj["id"]
channel_name = channel_obj["name"]
is_private = channel_obj["is_private"]

View file

@ -2,6 +2,8 @@
Microsoft Teams connector indexer.
"""
import time
from collections.abc import Awaitable, Callable
from datetime import UTC
from sqlalchemy.exc import SQLAlchemyError
@ -17,6 +19,12 @@ from app.utils.document_converters import (
generate_unique_identifier_hash,
)
# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]
# Heartbeat interval in seconds - update notification every 30 seconds
HEARTBEAT_INTERVAL_SECONDS = 30
from .base import (
build_document_metadata_markdown,
calculate_date_range,
@ -37,6 +45,7 @@ async def index_teams_messages(
start_date: str | None = None,
end_date: str | None = None,
update_last_indexed: bool = True,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
"""
Index Microsoft Teams messages from all accessible teams and channels.
@ -49,6 +58,8 @@ async def index_teams_messages(
start_date: Start date for indexing (YYYY-MM-DD format)
end_date: End date for indexing (YYYY-MM-DD format)
update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
on_heartbeat_callback: Optional callback to update notification during long-running indexing.
Called periodically with (indexed_count) to prevent task appearing stuck.
Returns:
Tuple containing (number of documents indexed, error message or None)
@ -161,6 +172,9 @@ async def index_teams_messages(
documents_skipped = 0
skipped_channels = []
# Heartbeat tracking - update notification periodically to prevent appearing stuck
last_heartbeat_time = time.time()
await task_logger.log_task_progress(
log_entry,
f"Starting to process {len(teams)} Teams",
@ -185,6 +199,11 @@ async def index_teams_messages(
# Process each team
for team in teams:
# Check if it's time for a heartbeat update
if on_heartbeat_callback and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = time.time()
team_id = team.get("id")
team_name = team.get("displayName", "Unknown Team")

View file

@ -2,6 +2,8 @@
Webcrawler connector indexer.
"""
import time
from collections.abc import Awaitable, Callable
from datetime import datetime
from sqlalchemy.exc import SQLAlchemyError
@ -20,6 +22,12 @@ from app.utils.document_converters import (
)
from app.utils.webcrawler_utils import parse_webcrawler_urls
# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]
# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30
from .base import (
check_document_by_unique_identifier,
check_duplicate_document_by_hash,
@ -38,6 +46,7 @@ async def index_crawled_urls(
start_date: str | None = None,
end_date: str | None = None,
update_last_indexed: bool = True,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
"""
Index web page URLs.
@ -50,6 +59,7 @@ async def index_crawled_urls(
start_date: Start date for filtering (YYYY-MM-DD format) - optional
end_date: End date for filtering (YYYY-MM-DD format) - optional
update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
on_heartbeat_callback: Optional callback to update notification during long-running indexing.
Returns:
Tuple containing (number of documents indexed, error message or None)
@ -140,7 +150,14 @@ async def index_crawled_urls(
documents_skipped = 0
failed_urls = []
# Heartbeat tracking - update notification periodically to prevent appearing stuck
last_heartbeat_time = time.time()
for idx, url in enumerate(urls, 1):
# Check if it's time for a heartbeat update
if on_heartbeat_callback and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = time.time()
try:
logger.info(f"Processing URL {idx}/{len(urls)}: {url}")