commit 8301e0169c
DESKTOP-RTLN3BA\$punk · 2026-02-01 18:02:27 -08:00
71 changed files with 2889 additions and 732 deletions

View file

@@ -0,0 +1,164 @@
"""Celery task to detect and mark stale connector indexing notifications as failed.

This task runs periodically (every 5 minutes by default) to find notifications
that are stuck in "in_progress" status but have no active Redis heartbeat key.
These are marked as "failed" to prevent the frontend from showing a perpetual
"syncing" state.

Detection mechanism:
- Active indexing tasks set a Redis key with a TTL (2 minutes) as a heartbeat
- If the task crashes, the Redis key expires automatically
- This cleanup task checks for in-progress notifications without a Redis heartbeat key
- Such notifications are marked as failed with a single batch UPDATE
"""

import json
import logging
import os
from datetime import UTC, datetime

import redis
from sqlalchemy import and_, text
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
from sqlalchemy.future import select
from sqlalchemy.pool import NullPool

from app.celery_app import celery_app
from app.config import config
from app.db import Notification

logger = logging.getLogger(__name__)

# Redis client for checking heartbeats
_redis_client: redis.Redis | None = None


def get_redis_client() -> redis.Redis:
    """Get or create the Redis client used for heartbeat checking."""
    global _redis_client
    if _redis_client is None:
        redis_url = os.getenv("CELERY_BROKER_URL", "redis://localhost:6379/0")
        _redis_client = redis.from_url(redis_url, decode_responses=True)
    return _redis_client


def _get_heartbeat_key(notification_id: int) -> str:
    """Generate the Redis key for a notification heartbeat."""
    return f"indexing:heartbeat:{notification_id}"


def get_celery_session_maker():
    """Create an async session maker for Celery tasks."""
    engine = create_async_engine(
        config.DATABASE_URL,
        poolclass=NullPool,
        echo=False,
    )
    return async_sessionmaker(engine, expire_on_commit=False)


@celery_app.task(name="cleanup_stale_indexing_notifications")
def cleanup_stale_indexing_notifications_task():
    """
    Check for stale connector indexing notifications and mark them as failed.

    This task finds notifications that:
    - Have type = 'connector_indexing'
    - Have metadata.status = 'in_progress'
    - Do NOT have a corresponding Redis heartbeat key (meaning the task crashed)

    and marks them as failed with a single batch UPDATE.
    """
    import asyncio

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(_cleanup_stale_notifications())
    finally:
        loop.close()


async def _cleanup_stale_notifications():
    """Find and mark stale connector indexing notifications as failed.

    Uses Redis TTL-based detection:
    1. Find all in-progress notifications
    2. Check which ones are missing their Redis heartbeat key
    3. Mark those as failed with a single batch UPDATE using the JSONB || operator
    """
    async with get_celery_session_maker()() as session:
        try:
            # Find all in-progress connector indexing notifications
            result = await session.execute(
                select(Notification.id).where(
                    and_(
                        Notification.type == "connector_indexing",
                        Notification.notification_metadata["status"].astext
                        == "in_progress",
                    )
                )
            )
            in_progress_ids = [row[0] for row in result.fetchall()]
            if not in_progress_ids:
                logger.debug("No in-progress connector indexing notifications found")
                return

            # Check which ones are missing heartbeat keys in Redis
            redis_client = get_redis_client()
            stale_notification_ids = []
            for notification_id in in_progress_ids:
                heartbeat_key = _get_heartbeat_key(notification_id)
                if not redis_client.exists(heartbeat_key):
                    stale_notification_ids.append(notification_id)

            if not stale_notification_ids:
                logger.debug(
                    f"All {len(in_progress_ids)} in-progress notifications have active Redis heartbeats"
                )
                return

            logger.warning(
                f"Found {len(stale_notification_ids)} stale connector indexing notifications "
                f"(no Redis heartbeat key): {stale_notification_ids}"
            )

            # Single batch UPDATE using the JSONB || operator.
            # This merges the update data into the existing notification_metadata
            # and also updates title and message for proper UI display.
            error_message = (
                "Something went wrong while syncing your content. Please retry."
            )
            update_data = {
                "status": "failed",
                "completed_at": datetime.now(UTC).isoformat(),
                "error_message": error_message,
                "sync_stage": "failed",
            }
            await session.execute(
                text("""
                    UPDATE notifications
                    SET metadata = metadata || CAST(:update_json AS jsonb),
                        title = 'Failed: ' || COALESCE(metadata->>'connector_name', 'Connector'),
                        message = :display_message
                    WHERE id = ANY(:ids)
                """),
                {
                    "update_json": json.dumps(update_data),
                    "display_message": error_message,
                    "ids": stale_notification_ids,
                },
            )
            await session.commit()
            logger.info(
                f"Successfully marked {len(stale_notification_ids)} stale notifications as failed (batch UPDATE)"
            )
        except Exception as e:
            logger.error(f"Error cleaning up stale notifications: {e!s}", exc_info=True)
            await session.rollback()
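How this task gets its 5-minute cadence is not shown in the diff. A minimal sketch of the scheduling side, assuming Celery beat and the task name registered above (the schedule value is an assumption matching the "every 5 minutes by default" note in the module docstring):

# Hypothetical beat wiring - not part of this commit.
from app.celery_app import celery_app

celery_app.conf.beat_schedule = {
    "cleanup-stale-indexing-notifications": {
        # Task name as registered via @celery_app.task(...) above
        "task": "cleanup_stale_indexing_notifications",
        "schedule": 300.0,  # seconds; "every 5 minutes by default"
    },
}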

View file

@@ -9,6 +9,7 @@ to avoid circular import issues with the connector_indexers package.
"""

import logging
from collections.abc import Awaitable, Callable
from importlib import import_module

from sqlalchemy.exc import SQLAlchemyError
@@ -22,6 +23,9 @@ from app.db import (
from app.services.composio_service import INDEXABLE_TOOLKITS, TOOLKIT_TO_INDEXER
from app.services.task_logging_service import TaskLoggingService

# Type alias for heartbeat callback function
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Set up logging
logger = logging.getLogger(__name__)
@@ -86,6 +90,7 @@ async def index_composio_connector(
    end_date: str | None = None,
    update_last_indexed: bool = True,
    max_items: int = 1000,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, int, str | None]:
    """
    Index content from a Composio connector.
@@ -102,6 +107,7 @@ async def index_composio_connector(
        end_date: End date for filtering (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp
        max_items: Maximum number of items to fetch
        on_heartbeat_callback: Optional callback to report progress for heartbeat updates

    Returns:
        Tuple of (number_of_indexed_items, number_of_skipped_items, error_message or None)
@@ -180,6 +186,7 @@ async def index_composio_connector(
        "log_entry": log_entry,
        "update_last_indexed": update_last_indexed,
        "max_items": max_items,
        "on_heartbeat_callback": on_heartbeat_callback,
    }

    # Add date params for toolkits that support them
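The callback threaded through index_composio_connector is the producer side of the Redis TTL mechanism from the cleanup task above. A minimal sketch of what a caller might pass, assuming the same indexing:heartbeat:{notification_id} key and the 2-minute TTL used there (make_heartbeat_callback and its parameters are hypothetical, not part of this commit):

# Hypothetical caller-side wiring - not part of this commit.
import redis.asyncio as aioredis

def make_heartbeat_callback(redis_url: str, notification_id: int):
    client = aioredis.from_url(redis_url, decode_responses=True)

    async def on_heartbeat(indexed_count: int) -> None:
        # Refresh the heartbeat key; if the worker crashes, the key expires
        # within 2 minutes and the cleanup task marks the notification failed.
        await client.set(
            f"indexing:heartbeat:{notification_id}", str(indexed_count), ex=120
        )

    return on_heartbeat

The result would then be passed as on_heartbeat_callback=make_heartbeat_callback(redis_url, notification_id) when starting the indexing run.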

View file

@@ -2,6 +2,9 @@
Airtable connector indexer.
"""

import time
from collections.abc import Awaitable, Callable

from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
@@ -27,6 +30,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_airtable_records(
    session: AsyncSession,
@@ -37,6 +46,7 @@ async def index_airtable_records(
    end_date: str | None = None,
    max_records: int = 2500,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index Airtable records for a given connector.
@@ -50,6 +60,7 @@ async def index_airtable_records(
        end_date: End date for filtering records (YYYY-MM-DD)
        max_records: Maximum number of records to fetch per table
        update_last_indexed: Whether to update the last_indexed_at timestamp
        on_heartbeat_callback: Optional callback to update the notification during long-running indexing.

    Returns:
        Tuple of (number_of_documents_processed, error_message)
@@ -127,8 +138,20 @@ async def index_airtable_records(
    logger.info(f"Found {len(bases)} Airtable bases to process")

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()
    total_documents_indexed = 0

    # Process each base
    for base in bases:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time)
            >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(total_documents_indexed)
            last_heartbeat_time = time.time()

        base_id = base.get("id")
        base_name = base.get("name", "Unknown Base")
@@ -204,6 +227,15 @@ async def index_airtable_records(
            documents_skipped = 0

            # Process each record
            for record in records:
                # Check if it's time for a heartbeat update
                if (
                    on_heartbeat_callback
                    and (time.time() - last_heartbeat_time)
                    >= HEARTBEAT_INTERVAL_SECONDS
                ):
                    await on_heartbeat_callback(total_documents_indexed)
                    last_heartbeat_time = time.time()

                try:
                    # Generate markdown content
                    markdown_content = (
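The same seven-line interval check is copied into every connector loop in this commit. A possible refactor (a sketch, not something the commit does) would hoist the pattern into a small helper with identical semantics:

# Hypothetical helper - not part of this commit.
import time
from collections.abc import Awaitable, Callable

HEARTBEAT_INTERVAL_SECONDS = 30

class HeartbeatTicker:
    """Invoke the callback at most once per interval; no-op when callback is None."""

    def __init__(self, callback: Callable[[int], Awaitable[None]] | None) -> None:
        self._callback = callback
        self._last = time.time()

    async def tick(self, indexed_count: int) -> None:
        if (
            self._callback
            and (time.time() - self._last) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await self._callback(indexed_count)
            self._last = time.time()

Each loop body would then open with await ticker.tick(total_documents_indexed) instead of the inline check.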

View file

@@ -2,6 +2,8 @@
BookStack connector indexer.
"""

import time
from collections.abc import Awaitable, Callable
from datetime import datetime

from sqlalchemy.exc import SQLAlchemyError
@@ -29,6 +31,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_bookstack_pages(
    session: AsyncSession,
@@ -38,6 +46,7 @@ async def index_bookstack_pages(
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index BookStack pages.
@@ -50,6 +59,7 @@ async def index_bookstack_pages(
        start_date: Start date for indexing (YYYY-MM-DD format)
        end_date: End date for indexing (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        on_heartbeat_callback: Optional callback to update the notification during long-running indexing.

    Returns:
        Tuple containing (number of documents indexed, error message or None)
@@ -179,7 +189,17 @@ async def index_bookstack_pages(
    skipped_pages = []
    documents_skipped = 0

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()

    for page in pages:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()

        try:
            page_id = page.get("id")
            page_name = page.get("name", "")

View file

@@ -3,6 +3,8 @@ ClickUp connector indexer.
"""

import contextlib
import time
from collections.abc import Awaitable, Callable
from datetime import datetime

from sqlalchemy.exc import SQLAlchemyError
@@ -29,6 +31,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_clickup_tasks(
    session: AsyncSession,
@@ -38,6 +46,7 @@ async def index_clickup_tasks(
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index tasks from a ClickUp workspace.
@@ -50,6 +59,7 @@ async def index_clickup_tasks(
        start_date: Start date for filtering tasks (YYYY-MM-DD format)
        end_date: End date for filtering tasks (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp
        on_heartbeat_callback: Optional callback to update the notification during long-running indexing.

    Returns:
        Tuple of (number of indexed tasks, error message if any)
@@ -132,6 +142,9 @@ async def index_clickup_tasks(
    documents_indexed = 0
    documents_skipped = 0

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()

    # Iterate workspaces and fetch tasks
    for workspace in workspaces:
        workspace_id = workspace.get("id")
@@ -170,6 +183,15 @@ async def index_clickup_tasks(
            )

            for task in tasks:
                # Check if it's time for a heartbeat update
                if (
                    on_heartbeat_callback
                    and (time.time() - last_heartbeat_time)
                    >= HEARTBEAT_INTERVAL_SECONDS
                ):
                    await on_heartbeat_callback(documents_indexed)
                    last_heartbeat_time = time.time()

                try:
                    task_id = task.get("id")
                    task_name = task.get("name", "Untitled Task")

View file

@@ -3,6 +3,8 @@ Confluence connector indexer.
"""

import contextlib
import time
from collections.abc import Awaitable, Callable
from datetime import datetime

from sqlalchemy.exc import SQLAlchemyError
@@ -30,6 +32,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_confluence_pages(
    session: AsyncSession,
@@ -39,6 +47,7 @@ async def index_confluence_pages(
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index Confluence pages and comments.
@@ -51,6 +60,7 @@ async def index_confluence_pages(
        start_date: Start date for indexing (YYYY-MM-DD format)
        end_date: End date for indexing (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        on_heartbeat_callback: Optional callback to update the notification during long-running indexing.

    Returns:
        Tuple containing (number of documents indexed, error message or None)
@@ -175,7 +185,17 @@ async def index_confluence_pages(
    skipped_pages = []
    documents_skipped = 0

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()

    for page in pages:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()

        try:
            page_id = page.get("id")
            page_title = page.get("title", "")

View file

@@ -3,6 +3,8 @@ Discord connector indexer.
"""

import asyncio
import time
from collections.abc import Awaitable, Callable
from datetime import UTC, datetime, timedelta

from sqlalchemy.exc import SQLAlchemyError
@@ -28,6 +30,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds - update the notification every 30 seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_discord_messages(
    session: AsyncSession,
@@ -37,6 +45,7 @@ async def index_discord_messages(
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index Discord messages from all accessible channels.
@@ -49,6 +58,8 @@ async def index_discord_messages(
        start_date: Start date for indexing (YYYY-MM-DD format)
        end_date: End date for indexing (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        on_heartbeat_callback: Optional callback to update the notification during long-running indexing.
            Called periodically with (indexed_count) to prevent the task from appearing stuck.

    Returns:
        Tuple containing (number of documents indexed, error message or None)
@@ -281,6 +292,9 @@ async def index_discord_messages(
    documents_skipped = 0
    skipped_channels: list[str] = []

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()

    # Process each guild and channel
    await task_logger.log_task_progress(
        log_entry,
@@ -290,6 +304,14 @@ async def index_discord_messages(
    try:
        for guild in guilds:
            # Check if it's time for a heartbeat update
            if (
                on_heartbeat_callback
                and (time.time() - last_heartbeat_time)
                >= HEARTBEAT_INTERVAL_SECONDS
            ):
                await on_heartbeat_callback(documents_indexed)
                last_heartbeat_time = time.time()

            guild_id = guild["id"]
            guild_name = guild["name"]
            logger.info(f"Processing guild: {guild_name} ({guild_id})")

View file

@@ -4,6 +4,8 @@ Elasticsearch indexer for SurfSense
import json
import logging
import time
from collections.abc import Awaitable, Callable
from datetime import UTC, datetime
from typing import Any
@@ -25,6 +27,12 @@ from .base import (
    get_current_timestamp,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30

logger = logging.getLogger(__name__)
@@ -36,6 +44,7 @@ async def index_elasticsearch_documents(
    start_date: str,
    end_date: str,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index documents from Elasticsearch into SurfSense
@@ -48,6 +57,7 @@ async def index_elasticsearch_documents(
        start_date: Start date for indexing (not used for Elasticsearch, kept for compatibility)
        end_date: End date for indexing (not used for Elasticsearch, kept for compatibility)
        update_last_indexed: Whether to update the last indexed timestamp
        on_heartbeat_callback: Optional callback to update the notification during long-running indexing.

    Returns:
        Tuple of (number of documents processed, error message if any)
@@ -155,6 +165,9 @@ async def index_elasticsearch_documents(
    documents_processed = 0

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()

    try:
        await task_logger.log_task_progress(
            log_entry,
@@ -172,6 +185,15 @@ async def index_elasticsearch_documents(
            size=min(max_documents, 100),  # Scroll in batches
            fields=config.get("ELASTICSEARCH_FIELDS"),
        ):
            # Check if it's time for a heartbeat update
            if (
                on_heartbeat_callback
                and (time.time() - last_heartbeat_time)
                >= HEARTBEAT_INTERVAL_SECONDS
            ):
                await on_heartbeat_callback(documents_processed)
                last_heartbeat_time = time.time()

            if documents_processed >= max_documents:
                break

View file

@@ -5,6 +5,8 @@ This indexer processes entire repository digests in one pass, dramatically
reducing LLM API calls compared to the previous file-by-file approach.
"""

import time
from collections.abc import Awaitable, Callable
from datetime import UTC, datetime

from sqlalchemy.exc import SQLAlchemyError
@@ -30,6 +32,12 @@ from .base import (
    logger,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds - update the notification every 30 seconds
HEARTBEAT_INTERVAL_SECONDS = 30

# Maximum characters for a single digest before splitting
# Most LLMs can handle 128k+ tokens now, but we'll be conservative
MAX_DIGEST_CHARS = 500_000  # ~125k tokens
@@ -43,6 +51,7 @@ async def index_github_repos(
    start_date: str | None = None,  # Ignored - GitHub indexes full repo snapshots
    end_date: str | None = None,  # Ignored - GitHub indexes full repo snapshots
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index GitHub repositories using gitingest for efficient processing.
@@ -62,6 +71,7 @@ async def index_github_repos(
        start_date: Ignored - kept for API compatibility
        end_date: Ignored - kept for API compatibility
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        on_heartbeat_callback: Optional callback to update the notification during long-running indexing.

    Returns:
        Tuple containing (number of documents indexed, error message or None)
@@ -168,7 +178,18 @@ async def index_github_repos(
        f"Starting gitingest indexing for {len(repo_full_names_to_index)} repositories."
    )

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()
    documents_indexed = 0

    for repo_full_name in repo_full_names_to_index:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()

        if not repo_full_name or not isinstance(repo_full_name, str):
            logger.warning(f"Skipping invalid repository entry: {repo_full_name}")
            continue

View file

@@ -2,10 +2,10 @@
Google Calendar connector indexer.
"""

import time
from collections.abc import Awaitable, Callable
from datetime import datetime, timedelta

import pytz
from dateutil.parser import isoparse
from google.oauth2.credentials import Credentials
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
@@ -30,6 +30,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_google_calendar_events(
    session: AsyncSession,
@@ -39,6 +45,7 @@ async def index_google_calendar_events(
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index Google Calendar events.
@@ -52,6 +59,7 @@ async def index_google_calendar_events(
        end_date: End date for indexing (YYYY-MM-DD format). Can be in the future to index upcoming events.
            Defaults to today if not provided.
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        on_heartbeat_callback: Optional callback to update the notification during long-running indexing.

    Returns:
        Tuple containing (number of documents indexed, error message or None)
@@ -281,7 +289,17 @@ async def index_google_calendar_events(
        0  # Track events skipped due to duplicate content_hash
    )

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()

    for event in events:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()

        try:
            event_id = event.get("id")
            event_summary = event.get("summary", "No Title")

View file

@@ -1,6 +1,8 @@
"""Google Drive indexer using Surfsense file processors."""

import logging
import time
from collections.abc import Awaitable, Callable

from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
@@ -24,6 +26,12 @@ from app.tasks.connector_indexers.base import (
)
from app.utils.document_converters import generate_unique_identifier_hash

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30

logger = logging.getLogger(__name__)
@@ -38,6 +46,7 @@ async def index_google_drive_files(
    update_last_indexed: bool = True,
    max_files: int = 500,
    include_subfolders: bool = False,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index Google Drive files for a specific connector.
@@ -53,6 +62,7 @@ async def index_google_drive_files(
        update_last_indexed: Whether to update last_indexed_at timestamp
        max_files: Maximum number of files to index
        include_subfolders: Whether to recursively index files in subfolders
        on_heartbeat_callback: Optional callback to update the notification during long-running indexing.

    Returns:
        Tuple of (number_of_indexed_files, error_message)
@@ -147,6 +157,7 @@ async def index_google_drive_files(
                log_entry=log_entry,
                max_files=max_files,
                include_subfolders=include_subfolders,
                on_heartbeat_callback=on_heartbeat_callback,
            )
        else:
            logger.info(f"Using full scan for connector {connector_id}")
@@ -163,6 +174,7 @@ async def index_google_drive_files(
                log_entry=log_entry,
                max_files=max_files,
                include_subfolders=include_subfolders,
                on_heartbeat_callback=on_heartbeat_callback,
            )

        documents_indexed, documents_skipped = result
@@ -383,6 +395,7 @@ async def _index_full_scan(
    log_entry: any,
    max_files: int,
    include_subfolders: bool = False,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, int]:
    """Perform full scan indexing of a folder."""
    await task_logger.log_task_progress(
@@ -399,10 +412,20 @@ async def _index_full_scan(
    documents_skipped = 0
    files_processed = 0

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()

    # Queue of folders to process: (folder_id, folder_name)
    folders_to_process = [(folder_id, folder_name)]

    while folders_to_process and files_processed < max_files:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()

        current_folder_id, current_folder_name = folders_to_process.pop(0)
        logger.info(f"Processing folder: {current_folder_name} ({current_folder_id})")
        page_token = None
@@ -485,6 +508,7 @@ async def _index_with_delta_sync(
    log_entry: any,
    max_files: int,
    include_subfolders: bool = False,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, int]:
    """Perform delta sync indexing using change tracking.
@@ -515,7 +539,17 @@ async def _index_with_delta_sync(
    documents_skipped = 0
    files_processed = 0

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()

    for change in changes:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()

        if files_processed >= max_files:
            break

View file

@@ -2,6 +2,8 @@
Google Gmail connector indexer.
"""

import time
from collections.abc import Awaitable, Callable
from datetime import datetime

from google.oauth2.credentials import Credentials
@@ -33,6 +35,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_google_gmail_messages(
    session: AsyncSession,
@@ -43,6 +51,7 @@ async def index_google_gmail_messages(
    end_date: str | None = None,
    update_last_indexed: bool = True,
    max_messages: int = 1000,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str]:
    """
    Index Gmail messages for a specific connector.
@@ -56,6 +65,7 @@ async def index_google_gmail_messages(
        end_date: End date for filtering messages (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        max_messages: Maximum number of messages to fetch (default: 1000)
        on_heartbeat_callback: Optional callback to update the notification during long-running indexing.

    Returns:
        Tuple of (number_of_indexed_messages, status_message)
@@ -212,7 +222,18 @@ async def index_google_gmail_messages(
    documents_indexed = 0
    skipped_messages = []
    documents_skipped = 0

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()

    for message in messages:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()

        try:
            # Extract message information
            message_id = message.get("id", "")

View file

@@ -3,6 +3,8 @@ Jira connector indexer.
"""

import contextlib
import time
from collections.abc import Awaitable, Callable
from datetime import datetime

from sqlalchemy.exc import SQLAlchemyError
@@ -30,6 +32,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds - update the notification every 30 seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_jira_issues(
    session: AsyncSession,
@@ -39,6 +47,7 @@ async def index_jira_issues(
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index Jira issues and comments.
@@ -51,6 +60,7 @@ async def index_jira_issues(
        start_date: Start date for indexing (YYYY-MM-DD format)
        end_date: End date for indexing (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        on_heartbeat_callback: Optional callback to update the notification during long-running indexing.

    Returns:
        Tuple containing (number of documents indexed, error message or None)
@@ -169,7 +179,17 @@ async def index_jira_issues(
    skipped_issues = []
    documents_skipped = 0

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()

    for issue in issues:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()

        try:
            issue_id = issue.get("key")
            issue_identifier = issue.get("key", "")

View file

@@ -2,6 +2,8 @@
Linear connector indexer.
"""

import time
from collections.abc import Awaitable, Callable
from datetime import datetime

from sqlalchemy.exc import SQLAlchemyError
@@ -29,6 +31,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds - update the notification every 30 seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_linear_issues(
    session: AsyncSession,
@@ -38,6 +46,7 @@ async def index_linear_issues(
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index Linear issues and comments.
@@ -50,6 +59,7 @@ async def index_linear_issues(
        start_date: Start date for indexing (YYYY-MM-DD format)
        end_date: End date for indexing (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        on_heartbeat_callback: Optional callback to update the notification during long-running indexing.

    Returns:
        Tuple containing (number of documents indexed, error message or None)
@@ -188,6 +198,9 @@ async def index_linear_issues(
    documents_skipped = 0
    skipped_issues = []

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()

    await task_logger.log_task_progress(
        log_entry,
        f"Starting to process {len(issues)} Linear issues",
@@ -196,6 +209,14 @@ async def index_linear_issues(
    # Process each issue
    for issue in issues:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()

        try:
            issue_id = issue.get("id", "")
            issue_identifier = issue.get("identifier", "")

View file

@@ -2,6 +2,8 @@
Luma connector indexer.
"""

import time
from collections.abc import Awaitable, Callable
from datetime import datetime, timedelta

from sqlalchemy.exc import SQLAlchemyError
@@ -28,6 +30,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_luma_events(
    session: AsyncSession,
@@ -37,6 +45,7 @@ async def index_luma_events(
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index Luma events.
@@ -50,6 +59,7 @@ async def index_luma_events(
        end_date: End date for indexing (YYYY-MM-DD format). Can be in the future to index upcoming events.
            Defaults to today if not provided.
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        on_heartbeat_callback: Optional callback to update the notification during long-running indexing.

    Returns:
        Tuple containing (number of documents indexed, error message or None)
@@ -221,7 +231,17 @@ async def index_luma_events(
    documents_skipped = 0
    skipped_events = []

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()

    for event in events:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()

        try:
            # Luma event structure fields - events have a nested 'event' field
            event_data = event.get("event", {})

View file

@@ -2,6 +2,7 @@
Notion connector indexer.
"""

import time
from collections.abc import Awaitable, Callable
from datetime import datetime
@@ -34,6 +35,13 @@ from .base import (
# Signature: async callback(retry_reason, attempt, max_attempts, wait_seconds) -> None
RetryCallbackType = Callable[[str, int, int, float], Awaitable[None]]

# Type alias for heartbeat callback
# Signature: async callback(indexed_count) -> None
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds - update the notification every 30 seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_notion_pages(
    session: AsyncSession,
@@ -44,6 +52,7 @@ async def index_notion_pages(
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_retry_callback: RetryCallbackType | None = None,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index all accessible Notion pages.
@@ -59,6 +68,8 @@ async def index_notion_pages(
        on_retry_callback: Optional callback for retry progress notifications.
            Signature: async callback(retry_reason, attempt, max_attempts, wait_seconds)
            retry_reason is one of: 'rate_limit', 'server_error', 'timeout'
        on_heartbeat_callback: Optional callback to update the notification during long-running indexing.
            Called periodically with (indexed_count) to prevent the task from appearing stuck.

    Returns:
        Tuple containing (number of documents indexed, error message or None)
@@ -211,6 +222,9 @@ async def index_notion_pages(
    documents_skipped = 0
    skipped_pages = []

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()

    await task_logger.log_task_progress(
        log_entry,
        f"Starting to process {len(pages)} Notion pages",
@@ -219,6 +233,14 @@ async def index_notion_pages(
    # Process each page
    for page in pages:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()

        try:
            page_id = page.get("page_id")
            page_title = page.get("title", f"Untitled page ({page_id})")
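Notion is the one indexer that now takes two callbacks. A minimal sketch of a caller wiring both (the callback bodies are assumptions; only the signatures come from the type aliases above):

# Hypothetical wiring - not part of this commit.
import logging

logger = logging.getLogger(__name__)

async def on_retry(reason: str, attempt: int, max_attempts: int, wait_seconds: float) -> None:
    # Matches RetryCallbackType: async callback(retry_reason, attempt, max_attempts, wait_seconds)
    logger.info(f"Notion retry ({reason}): attempt {attempt}/{max_attempts}, waiting {wait_seconds:.1f}s")

async def on_heartbeat(indexed_count: int) -> None:
    # Matches HeartbeatCallbackType: async callback(indexed_count)
    logger.info(f"Notion indexing alive: {indexed_count} pages so far")

# documents_indexed, error = await index_notion_pages(
#     session, ..., on_retry_callback=on_retry, on_heartbeat_callback=on_heartbeat
# )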

View file

@@ -7,6 +7,8 @@ This connector is only available in self-hosted mode.
import os
import re
import time
from collections.abc import Awaitable, Callable
from datetime import UTC, datetime
from pathlib import Path
@@ -35,6 +37,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30


def parse_frontmatter(content: str) -> tuple[dict | None, str]:
    """
@@ -152,6 +160,7 @@ async def index_obsidian_vault(
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index notes from a local Obsidian vault.
@@ -167,6 +176,7 @@ async def index_obsidian_vault(
        start_date: Start date for filtering (YYYY-MM-DD format) - optional
        end_date: End date for filtering (YYYY-MM-DD format) - optional
        update_last_indexed: Whether to update the last_indexed_at timestamp
        on_heartbeat_callback: Optional callback to update the notification during long-running indexing.

    Returns:
        Tuple containing (number of documents indexed, error message or None)
@@ -305,7 +315,17 @@ async def index_obsidian_vault(
    indexed_count = 0
    skipped_count = 0

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()

    for file_info in files:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(indexed_count)
            last_heartbeat_time = time.time()

        try:
            file_path = file_info["path"]
            relative_path = file_info["relative_path"]

View file

@@ -2,6 +2,8 @@
Slack connector indexer.
"""

import time
from collections.abc import Awaitable, Callable
from datetime import datetime

from slack_sdk.errors import SlackApiError
@@ -29,6 +31,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds - update the notification every 30 seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_slack_messages(
    session: AsyncSession,
@@ -38,6 +46,7 @@ async def index_slack_messages(
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index Slack messages from all accessible channels.
@@ -50,6 +59,8 @@ async def index_slack_messages(
        start_date: Start date for indexing (YYYY-MM-DD format)
        end_date: End date for indexing (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        on_heartbeat_callback: Optional callback to update the notification during long-running indexing.
            Called periodically with (indexed_count) to prevent the task from appearing stuck.

    Returns:
        Tuple containing (number of documents indexed, error message or None)
@@ -164,6 +175,9 @@ async def index_slack_messages(
    documents_skipped = 0
    skipped_channels = []

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()

    await task_logger.log_task_progress(
        log_entry,
        f"Starting to process {len(channels)} Slack channels",
@@ -172,6 +186,13 @@ async def index_slack_messages(
    # Process each channel
    for channel_obj in channels:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()

        channel_id = channel_obj["id"]
        channel_name = channel_obj["name"]
        is_private = channel_obj["is_private"]

View file

@@ -2,6 +2,8 @@
Microsoft Teams connector indexer.
"""

import time
from collections.abc import Awaitable, Callable
from datetime import UTC

from sqlalchemy.exc import SQLAlchemyError
@@ -28,6 +30,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds - update the notification every 30 seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_teams_messages(
    session: AsyncSession,
@@ -37,6 +45,7 @@ async def index_teams_messages(
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index Microsoft Teams messages from all accessible teams and channels.
@@ -49,6 +58,8 @@ async def index_teams_messages(
        start_date: Start date for indexing (YYYY-MM-DD format)
        end_date: End date for indexing (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        on_heartbeat_callback: Optional callback to update the notification during long-running indexing.
            Called periodically with (indexed_count) to prevent the task from appearing stuck.

    Returns:
        Tuple containing (number of documents indexed, error message or None)
@@ -161,6 +172,9 @@ async def index_teams_messages(
    documents_skipped = 0
    skipped_channels = []

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()

    await task_logger.log_task_progress(
        log_entry,
        f"Starting to process {len(teams)} Teams",
@@ -185,6 +199,14 @@ async def index_teams_messages(
    # Process each team
    for team in teams:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time)
            >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()

        team_id = team.get("id")
        team_name = team.get("displayName", "Unknown Team")

View file

@@ -2,6 +2,8 @@
Webcrawler connector indexer.
"""

import time
from collections.abc import Awaitable, Callable
from datetime import datetime

from sqlalchemy.exc import SQLAlchemyError
@@ -29,6 +31,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_crawled_urls(
    session: AsyncSession,
@@ -38,6 +46,7 @@ async def index_crawled_urls(
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index web page URLs.
@@ -50,6 +59,7 @@ async def index_crawled_urls(
        start_date: Start date for filtering (YYYY-MM-DD format) - optional
        end_date: End date for filtering (YYYY-MM-DD format) - optional
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        on_heartbeat_callback: Optional callback to update the notification during long-running indexing.

    Returns:
        Tuple containing (number of documents indexed, error message or None)
@@ -140,7 +150,17 @@ async def index_crawled_urls(
    documents_skipped = 0
    failed_urls = []

    # Heartbeat tracking - update the notification periodically to prevent the task from appearing stuck
    last_heartbeat_time = time.time()

    for idx, url in enumerate(urls, 1):
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()

        try:
            logger.info(f"Processing URL {idx}/{len(urls)}: {url}")