mirror of https://github.com/MODSetter/SurfSense.git
synced 2026-04-26 09:16:22 +02:00

Merge branch 'dev' of https://github.com/MODSetter/SurfSense into dev

commit 8301e0169c
71 changed files with 2889 additions and 732 deletions
@@ -0,0 +1,164 @@
"""Celery task to detect and mark stale connector indexing notifications as failed.

This task runs periodically (every 5 minutes by default) to find notifications
that are stuck in "in_progress" status but don't have an active Redis heartbeat key.
These are marked as "failed" to prevent the frontend from showing a perpetual "syncing" state.

Detection mechanism:
- Active indexing tasks set a Redis key with TTL (2 minutes) as a heartbeat
- If the task crashes, the Redis key expires automatically
- This cleanup task checks for in-progress notifications without a Redis heartbeat key
- Such notifications are marked as failed in a single batch UPDATE
"""

import json
import logging
import os
from datetime import UTC, datetime

import redis
from sqlalchemy import and_, text
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
from sqlalchemy.future import select
from sqlalchemy.pool import NullPool

from app.celery_app import celery_app
from app.config import config
from app.db import Notification

logger = logging.getLogger(__name__)

# Redis client for checking heartbeats
_redis_client: redis.Redis | None = None


def get_redis_client() -> redis.Redis:
    """Get or create Redis client for heartbeat checking."""
    global _redis_client
    if _redis_client is None:
        redis_url = os.getenv("CELERY_BROKER_URL", "redis://localhost:6379/0")
        _redis_client = redis.from_url(redis_url, decode_responses=True)
    return _redis_client


def _get_heartbeat_key(notification_id: int) -> str:
    """Generate Redis key for notification heartbeat."""
    return f"indexing:heartbeat:{notification_id}"


def get_celery_session_maker():
    """Create async session maker for Celery tasks."""
    engine = create_async_engine(
        config.DATABASE_URL,
        poolclass=NullPool,
        echo=False,
    )
    return async_sessionmaker(engine, expire_on_commit=False)


@celery_app.task(name="cleanup_stale_indexing_notifications")
def cleanup_stale_indexing_notifications_task():
    """
    Check for stale connector indexing notifications and mark them as failed.

    This task finds notifications that:
    - Have type = 'connector_indexing'
    - Have metadata.status = 'in_progress'
    - Do NOT have a corresponding Redis heartbeat key (meaning the task crashed)

    And marks them as failed in a single batch UPDATE.
    """
    import asyncio

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    try:
        loop.run_until_complete(_cleanup_stale_notifications())
    finally:
        loop.close()


async def _cleanup_stale_notifications():
    """Find and mark stale connector indexing notifications as failed.

    Uses Redis TTL-based detection:
    1. Find all in-progress notifications
    2. Check which ones are missing their Redis heartbeat key
    3. Mark those as failed in a single batch UPDATE using the JSONB || operator
    """
    async with get_celery_session_maker()() as session:
        try:
            # Find all in-progress connector indexing notifications
            result = await session.execute(
                select(Notification.id).where(
                    and_(
                        Notification.type == "connector_indexing",
                        Notification.notification_metadata["status"].astext
                        == "in_progress",
                    )
                )
            )
            in_progress_ids = [row[0] for row in result.fetchall()]

            if not in_progress_ids:
                logger.debug("No in-progress connector indexing notifications found")
                return

            # Check which ones are missing heartbeat keys in Redis
            redis_client = get_redis_client()
            stale_notification_ids = []

            for notification_id in in_progress_ids:
                heartbeat_key = _get_heartbeat_key(notification_id)
                if not redis_client.exists(heartbeat_key):
                    stale_notification_ids.append(notification_id)

            if not stale_notification_ids:
                logger.debug(
                    f"All {len(in_progress_ids)} in-progress notifications have active Redis heartbeats"
                )
                return

            logger.warning(
                f"Found {len(stale_notification_ids)} stale connector indexing notifications "
                f"(no Redis heartbeat key): {stale_notification_ids}"
            )

            # Single batch UPDATE using the JSONB || operator
            # This merges the update data into existing notification_metadata
            # Also updates title and message for proper UI display
            error_message = (
                "Something went wrong while syncing your content. Please retry."
            )

            update_data = {
                "status": "failed",
                "completed_at": datetime.now(UTC).isoformat(),
                "error_message": error_message,
                "sync_stage": "failed",
            }

            await session.execute(
                text("""
                    UPDATE notifications
                    SET metadata = metadata || CAST(:update_json AS jsonb),
                        title = 'Failed: ' || COALESCE(metadata->>'connector_name', 'Connector'),
                        message = :display_message
                    WHERE id = ANY(:ids)
                """),
                {
                    "update_json": json.dumps(update_data),
                    "display_message": error_message,
                    "ids": stale_notification_ids,
                },
            )

            await session.commit()
            logger.info(
                f"Successfully marked {len(stale_notification_ids)} stale notifications as failed (batch UPDATE)"
            )

        except Exception as e:
            logger.error(f"Error cleaning up stale notifications: {e!s}", exc_info=True)
            await session.rollback()
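For context, the heartbeat key this cleanup checks is written by the indexing tasks themselves. A minimal sketch of that producer side, assuming the key format and 2-minute TTL described in the module docstring above; the function name and the beat-schedule entry are illustrative, not taken from this commit:

HEARTBEAT_TTL_SECONDS = 120  # 2 minutes, per the module docstring


def touch_heartbeat(notification_id: int) -> None:
    """Hypothetical producer: refresh the heartbeat key so it only expires if the task dies."""
    redis_client = get_redis_client()
    redis_client.set(
        _get_heartbeat_key(notification_id),
        "alive",
        ex=HEARTBEAT_TTL_SECONDS,
    )


# The cleanup task would then be scheduled via Celery beat, e.g. (assumed entry):
# celery_app.conf.beat_schedule = {
#     "cleanup-stale-indexing-notifications": {
#         "task": "cleanup_stale_indexing_notifications",
#         "schedule": 300.0,  # every 5 minutes, per the docstring
#     },
# }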
@@ -9,6 +9,7 @@ to avoid circular import issues with the connector_indexers package.
"""

import logging
from collections.abc import Awaitable, Callable
from importlib import import_module

from sqlalchemy.exc import SQLAlchemyError

@@ -22,6 +23,9 @@ from app.db import (
from app.services.composio_service import INDEXABLE_TOOLKITS, TOOLKIT_TO_INDEXER
from app.services.task_logging_service import TaskLoggingService

# Type alias for heartbeat callback function
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Set up logging
logger = logging.getLogger(__name__)

@@ -86,6 +90,7 @@ async def index_composio_connector(
    end_date: str | None = None,
    update_last_indexed: bool = True,
    max_items: int = 1000,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, int, str | None]:
    """
    Index content from a Composio connector.

@@ -102,6 +107,7 @@ async def index_composio_connector(
        end_date: End date for filtering (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp
        max_items: Maximum number of items to fetch
        on_heartbeat_callback: Optional callback to report progress for heartbeat updates

    Returns:
        Tuple of (number_of_indexed_items, number_of_skipped_items, error_message or None)

@@ -180,6 +186,7 @@ async def index_composio_connector(
        "log_entry": log_entry,
        "update_last_indexed": update_last_indexed,
        "max_items": max_items,
        "on_heartbeat_callback": on_heartbeat_callback,
    }

    # Add date params for toolkits that support them
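HeartbeatCallbackType fixes the callback shape: an async function that receives the running count of indexed items. A sketch of a callback a caller might pass in, reusing the helpers from the cleanup module above; the factory name is an assumption, not code from this commit:

import logging

logger = logging.getLogger(__name__)


def make_heartbeat_callback(notification_id: int) -> HeartbeatCallbackType:
    """Hypothetical factory: build a callback that refreshes the Redis heartbeat."""

    async def on_heartbeat(indexed_count: int) -> None:
        # Refresh the TTL so the cleanup task keeps seeing this run as alive.
        redis_client = get_redis_client()
        redis_client.set(_get_heartbeat_key(notification_id), "alive", ex=120)
        logger.debug(
            f"Heartbeat for notification {notification_id}: {indexed_count} items indexed"
        )

    return on_heartbeat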
@@ -2,6 +2,9 @@
Airtable connector indexer.
"""

import time
from collections.abc import Awaitable, Callable

from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession

@@ -27,6 +30,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_airtable_records(
    session: AsyncSession,

@@ -37,6 +46,7 @@ async def index_airtable_records(
    end_date: str | None = None,
    max_records: int = 2500,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index Airtable records for a given connector.

@@ -50,6 +60,7 @@ async def index_airtable_records(
        end_date: End date for filtering records (YYYY-MM-DD)
        max_records: Maximum number of records to fetch per table
        update_last_indexed: Whether to update the last_indexed_at timestamp
        on_heartbeat_callback: Optional callback to update notification during long-running indexing.

    Returns:
        Tuple of (number_of_documents_processed, error_message)

@@ -127,8 +138,20 @@ async def index_airtable_records(

    logger.info(f"Found {len(bases)} Airtable bases to process")

    # Heartbeat tracking - update notification periodically to prevent appearing stuck
    last_heartbeat_time = time.time()
    total_documents_indexed = 0

    # Process each base
    for base in bases:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(total_documents_indexed)
            last_heartbeat_time = time.time()
        base_id = base.get("id")
        base_name = base.get("name", "Unknown Base")

@@ -204,6 +227,15 @@ async def index_airtable_records(
        documents_skipped = 0
        # Process each record
        for record in records:
            # Check if it's time for a heartbeat update
            if (
                on_heartbeat_callback
                and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
            ):
                await on_heartbeat_callback(total_documents_indexed)
                last_heartbeat_time = time.time()

            try:
                # Generate markdown content
                markdown_content = (
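The same 30-second interval check recurs verbatim in every connector loop that follows. A small helper that would factor it out might look like the sketch below; this is a hypothetical refactor, not part of the commit:

import time
from collections.abc import Awaitable, Callable

HeartbeatCallbackType = Callable[[int], Awaitable[None]]


class HeartbeatTicker:
    """Invokes the heartbeat callback at most once per interval (hypothetical helper)."""

    def __init__(
        self,
        callback: HeartbeatCallbackType | None,
        interval_seconds: float = 30.0,
    ) -> None:
        self._callback = callback
        self._interval = interval_seconds
        self._last = time.time()

    async def tick(self, indexed_count: int) -> None:
        # No-op when no callback was supplied or the interval hasn't elapsed yet.
        if self._callback and (time.time() - self._last) >= self._interval:
            await self._callback(indexed_count)
            self._last = time.time()


# Usage inside an indexing loop would reduce each inlined check to one line:
#     ticker = HeartbeatTicker(on_heartbeat_callback)
#     for record in records:
#         await ticker.tick(documents_indexed)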
@@ -2,6 +2,8 @@
BookStack connector indexer.
"""

import time
from collections.abc import Awaitable, Callable
from datetime import datetime

from sqlalchemy.exc import SQLAlchemyError

@@ -29,6 +31,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_bookstack_pages(
    session: AsyncSession,

@@ -38,6 +46,7 @@ async def index_bookstack_pages(
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index BookStack pages.

@@ -50,6 +59,7 @@ async def index_bookstack_pages(
        start_date: Start date for indexing (YYYY-MM-DD format)
        end_date: End date for indexing (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        on_heartbeat_callback: Optional callback to update notification during long-running indexing.

    Returns:
        Tuple containing (number of documents indexed, error message or None)

@@ -179,7 +189,17 @@ async def index_bookstack_pages(
        skipped_pages = []
        documents_skipped = 0

        # Heartbeat tracking - update notification periodically to prevent appearing stuck
        last_heartbeat_time = time.time()

        for page in pages:
            # Check if it's time for a heartbeat update
            if (
                on_heartbeat_callback
                and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
            ):
                await on_heartbeat_callback(documents_indexed)
                last_heartbeat_time = time.time()
            try:
                page_id = page.get("id")
                page_name = page.get("name", "")
@@ -3,6 +3,8 @@ ClickUp connector indexer.
"""

import contextlib
import time
from collections.abc import Awaitable, Callable
from datetime import datetime

from sqlalchemy.exc import SQLAlchemyError

@@ -29,6 +31,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_clickup_tasks(
    session: AsyncSession,

@@ -38,6 +46,7 @@ async def index_clickup_tasks(
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index tasks from ClickUp workspace.

@@ -50,6 +59,7 @@ async def index_clickup_tasks(
        start_date: Start date for filtering tasks (YYYY-MM-DD format)
        end_date: End date for filtering tasks (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp
        on_heartbeat_callback: Optional callback to update notification during long-running indexing.

    Returns:
        Tuple of (number of indexed tasks, error message if any)

@@ -132,6 +142,9 @@ async def index_clickup_tasks(
        documents_indexed = 0
        documents_skipped = 0

        # Heartbeat tracking - update notification periodically to prevent appearing stuck
        last_heartbeat_time = time.time()

        # Iterate workspaces and fetch tasks
        for workspace in workspaces:
            workspace_id = workspace.get("id")

@@ -170,6 +183,15 @@ async def index_clickup_tasks(
            )

            for task in tasks:
                # Check if it's time for a heartbeat update
                if (
                    on_heartbeat_callback
                    and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
                ):
                    await on_heartbeat_callback(documents_indexed)
                    last_heartbeat_time = time.time()

                try:
                    task_id = task.get("id")
                    task_name = task.get("name", "Untitled Task")
@@ -3,6 +3,8 @@ Confluence connector indexer.
"""

import contextlib
import time
from collections.abc import Awaitable, Callable
from datetime import datetime

from sqlalchemy.exc import SQLAlchemyError

@@ -30,6 +32,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_confluence_pages(
    session: AsyncSession,

@@ -39,6 +47,7 @@ async def index_confluence_pages(
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index Confluence pages and comments.

@@ -51,6 +60,7 @@ async def index_confluence_pages(
        start_date: Start date for indexing (YYYY-MM-DD format)
        end_date: End date for indexing (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        on_heartbeat_callback: Optional callback to update notification during long-running indexing.

    Returns:
        Tuple containing (number of documents indexed, error message or None)

@@ -175,7 +185,17 @@ async def index_confluence_pages(
        skipped_pages = []
        documents_skipped = 0

        # Heartbeat tracking - update notification periodically to prevent appearing stuck
        last_heartbeat_time = time.time()

        for page in pages:
            # Check if it's time for a heartbeat update
            if (
                on_heartbeat_callback
                and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
            ):
                await on_heartbeat_callback(documents_indexed)
                last_heartbeat_time = time.time()
            try:
                page_id = page.get("id")
                page_title = page.get("title", "")
@@ -3,6 +3,8 @@ Discord connector indexer.
"""

import asyncio
import time
from collections.abc import Awaitable, Callable
from datetime import UTC, datetime, timedelta

from sqlalchemy.exc import SQLAlchemyError

@@ -28,6 +30,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds - update notification every 30 seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_discord_messages(
    session: AsyncSession,

@@ -37,6 +45,7 @@ async def index_discord_messages(
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index Discord messages from all accessible channels.

@@ -49,6 +58,8 @@ async def index_discord_messages(
        start_date: Start date for indexing (YYYY-MM-DD format)
        end_date: End date for indexing (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        on_heartbeat_callback: Optional callback to update notification during long-running indexing.
            Called periodically with (indexed_count) to prevent task appearing stuck.

    Returns:
        Tuple containing (number of documents indexed, error message or None)

@@ -281,6 +292,9 @@ async def index_discord_messages(
    documents_skipped = 0
    skipped_channels: list[str] = []

    # Heartbeat tracking - update notification periodically to prevent appearing stuck
    last_heartbeat_time = time.time()

    # Process each guild and channel
    await task_logger.log_task_progress(
        log_entry,

@@ -290,6 +304,14 @@ async def index_discord_messages(

    try:
        for guild in guilds:
            # Check if it's time for a heartbeat update
            if (
                on_heartbeat_callback
                and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
            ):
                await on_heartbeat_callback(documents_indexed)
                last_heartbeat_time = time.time()
            guild_id = guild["id"]
            guild_name = guild["name"]
            logger.info(f"Processing guild: {guild_name} ({guild_id})")
@@ -4,6 +4,8 @@ Elasticsearch indexer for SurfSense

import json
import logging
import time
from collections.abc import Awaitable, Callable
from datetime import UTC, datetime
from typing import Any

@@ -25,6 +27,12 @@ from .base import (
    get_current_timestamp,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30

logger = logging.getLogger(__name__)


@@ -36,6 +44,7 @@ async def index_elasticsearch_documents(
    start_date: str,
    end_date: str,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index documents from Elasticsearch into SurfSense

@@ -48,6 +57,7 @@ async def index_elasticsearch_documents(
        start_date: Start date for indexing (not used for Elasticsearch, kept for compatibility)
        end_date: End date for indexing (not used for Elasticsearch, kept for compatibility)
        update_last_indexed: Whether to update the last indexed timestamp
        on_heartbeat_callback: Optional callback to update notification during long-running indexing.

    Returns:
        Tuple of (number of documents processed, error message if any)

@@ -155,6 +165,9 @@ async def index_elasticsearch_documents(

    documents_processed = 0

    # Heartbeat tracking - update notification periodically to prevent appearing stuck
    last_heartbeat_time = time.time()

    try:
        await task_logger.log_task_progress(
            log_entry,

@@ -172,6 +185,15 @@ async def index_elasticsearch_documents(
            size=min(max_documents, 100),  # Scroll in batches
            fields=config.get("ELASTICSEARCH_FIELDS"),
        ):
            # Check if it's time for a heartbeat update
            if (
                on_heartbeat_callback
                and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
            ):
                await on_heartbeat_callback(documents_processed)
                last_heartbeat_time = time.time()

            if documents_processed >= max_documents:
                break
@@ -5,6 +5,8 @@ This indexer processes entire repository digests in one pass, dramatically
reducing LLM API calls compared to the previous file-by-file approach.
"""

import time
from collections.abc import Awaitable, Callable
from datetime import UTC, datetime

from sqlalchemy.exc import SQLAlchemyError

@@ -30,6 +32,12 @@ from .base import (
    logger,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds - update notification every 30 seconds
HEARTBEAT_INTERVAL_SECONDS = 30

# Maximum tokens for a single digest before splitting
# Most LLMs can handle 128k+ tokens now, but we'll be conservative
MAX_DIGEST_CHARS = 500_000  # ~125k tokens

@@ -43,6 +51,7 @@ async def index_github_repos(
    start_date: str | None = None,  # Ignored - GitHub indexes full repo snapshots
    end_date: str | None = None,  # Ignored - GitHub indexes full repo snapshots
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index GitHub repositories using gitingest for efficient processing.

@@ -62,6 +71,7 @@ async def index_github_repos(
        start_date: Ignored - kept for API compatibility
        end_date: Ignored - kept for API compatibility
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        on_heartbeat_callback: Optional callback to update notification during long-running indexing.

    Returns:
        Tuple containing (number of documents indexed, error message or None)

@@ -168,7 +178,18 @@ async def index_github_repos(
        f"Starting gitingest indexing for {len(repo_full_names_to_index)} repositories."
    )

    # Heartbeat tracking - update notification periodically to prevent appearing stuck
    last_heartbeat_time = time.time()
    documents_indexed = 0

    for repo_full_name in repo_full_names_to_index:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()
        if not repo_full_name or not isinstance(repo_full_name, str):
            logger.warning(f"Skipping invalid repository entry: {repo_full_name}")
            continue
@@ -2,10 +2,10 @@
Google Calendar connector indexer.
"""

import time
from collections.abc import Awaitable, Callable
from datetime import datetime, timedelta

import pytz
from dateutil.parser import isoparse
from google.oauth2.credentials import Credentials
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession

@@ -30,6 +30,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_google_calendar_events(
    session: AsyncSession,

@@ -39,6 +45,7 @@ async def index_google_calendar_events(
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index Google Calendar events.

@@ -52,6 +59,7 @@ async def index_google_calendar_events(
        end_date: End date for indexing (YYYY-MM-DD format). Can be in the future to index upcoming events.
            Defaults to today if not provided.
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        on_heartbeat_callback: Optional callback to update notification during long-running indexing.

    Returns:
        Tuple containing (number of documents indexed, error message or None)

@@ -281,7 +289,17 @@ async def index_google_calendar_events(
        0  # Track events skipped due to duplicate content_hash
    )

    # Heartbeat tracking - update notification periodically to prevent appearing stuck
    last_heartbeat_time = time.time()

    for event in events:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()
        try:
            event_id = event.get("id")
            event_summary = event.get("summary", "No Title")
@@ -1,6 +1,8 @@
"""Google Drive indexer using Surfsense file processors."""

import logging
import time
from collections.abc import Awaitable, Callable

from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession

@@ -24,6 +26,12 @@ from app.tasks.connector_indexers.base import (
)
from app.utils.document_converters import generate_unique_identifier_hash

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30

logger = logging.getLogger(__name__)


@@ -38,6 +46,7 @@ async def index_google_drive_files(
    update_last_indexed: bool = True,
    max_files: int = 500,
    include_subfolders: bool = False,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index Google Drive files for a specific connector.

@@ -53,6 +62,7 @@ async def index_google_drive_files(
        update_last_indexed: Whether to update last_indexed_at timestamp
        max_files: Maximum number of files to index
        include_subfolders: Whether to recursively index files in subfolders
        on_heartbeat_callback: Optional callback to update notification during long-running indexing.

    Returns:
        Tuple of (number_of_indexed_files, error_message)

@@ -147,6 +157,7 @@ async def index_google_drive_files(
                log_entry=log_entry,
                max_files=max_files,
                include_subfolders=include_subfolders,
                on_heartbeat_callback=on_heartbeat_callback,
            )
        else:
            logger.info(f"Using full scan for connector {connector_id}")

@@ -163,6 +174,7 @@ async def index_google_drive_files(
                log_entry=log_entry,
                max_files=max_files,
                include_subfolders=include_subfolders,
                on_heartbeat_callback=on_heartbeat_callback,
            )

        documents_indexed, documents_skipped = result

@@ -383,6 +395,7 @@ async def _index_full_scan(
    log_entry: any,
    max_files: int,
    include_subfolders: bool = False,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, int]:
    """Perform full scan indexing of a folder."""
    await task_logger.log_task_progress(

@@ -399,10 +412,20 @@ async def _index_full_scan(
    documents_skipped = 0
    files_processed = 0

    # Heartbeat tracking - update notification periodically to prevent appearing stuck
    last_heartbeat_time = time.time()

    # Queue of folders to process: (folder_id, folder_name)
    folders_to_process = [(folder_id, folder_name)]

    while folders_to_process and files_processed < max_files:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()
        current_folder_id, current_folder_name = folders_to_process.pop(0)
        logger.info(f"Processing folder: {current_folder_name} ({current_folder_id})")
        page_token = None

@@ -485,6 +508,7 @@ async def _index_with_delta_sync(
    log_entry: any,
    max_files: int,
    include_subfolders: bool = False,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, int]:
    """Perform delta sync indexing using change tracking.

@@ -515,7 +539,17 @@ async def _index_with_delta_sync(
    documents_skipped = 0
    files_processed = 0

    # Heartbeat tracking - update notification periodically to prevent appearing stuck
    last_heartbeat_time = time.time()

    for change in changes:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()
        if files_processed >= max_files:
            break
@@ -2,6 +2,8 @@
Google Gmail connector indexer.
"""

import time
from collections.abc import Awaitable, Callable
from datetime import datetime

from google.oauth2.credentials import Credentials

@@ -33,6 +35,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_google_gmail_messages(
    session: AsyncSession,

@@ -43,6 +51,7 @@ async def index_google_gmail_messages(
    end_date: str | None = None,
    update_last_indexed: bool = True,
    max_messages: int = 1000,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str]:
    """
    Index Gmail messages for a specific connector.

@@ -56,6 +65,7 @@ async def index_google_gmail_messages(
        end_date: End date for filtering messages (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        max_messages: Maximum number of messages to fetch (default: 1000)
        on_heartbeat_callback: Optional callback to update notification during long-running indexing.

    Returns:
        Tuple of (number_of_indexed_messages, status_message)

@@ -212,7 +222,18 @@ async def index_google_gmail_messages(
    documents_indexed = 0
    skipped_messages = []
    documents_skipped = 0

    # Heartbeat tracking - update notification periodically to prevent appearing stuck
    last_heartbeat_time = time.time()

    for message in messages:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()
        try:
            # Extract message information
            message_id = message.get("id", "")
@@ -3,6 +3,8 @@ Jira connector indexer.
"""

import contextlib
import time
from collections.abc import Awaitable, Callable
from datetime import datetime

from sqlalchemy.exc import SQLAlchemyError

@@ -30,6 +32,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds - update notification every 30 seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_jira_issues(
    session: AsyncSession,

@@ -39,6 +47,7 @@ async def index_jira_issues(
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index Jira issues and comments.

@@ -51,6 +60,7 @@ async def index_jira_issues(
        start_date: Start date for indexing (YYYY-MM-DD format)
        end_date: End date for indexing (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        on_heartbeat_callback: Optional callback to update notification during long-running indexing.

    Returns:
        Tuple containing (number of documents indexed, error message or None)

@@ -169,7 +179,17 @@ async def index_jira_issues(
        skipped_issues = []
        documents_skipped = 0

        # Heartbeat tracking - update notification periodically to prevent appearing stuck
        last_heartbeat_time = time.time()

        for issue in issues:
            # Check if it's time for a heartbeat update
            if (
                on_heartbeat_callback
                and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
            ):
                await on_heartbeat_callback(documents_indexed)
                last_heartbeat_time = time.time()
            try:
                issue_id = issue.get("key")
                issue_identifier = issue.get("key", "")
@@ -2,6 +2,8 @@
Linear connector indexer.
"""

import time
from collections.abc import Awaitable, Callable
from datetime import datetime

from sqlalchemy.exc import SQLAlchemyError

@@ -29,6 +31,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds - update notification every 30 seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_linear_issues(
    session: AsyncSession,

@@ -38,6 +46,7 @@ async def index_linear_issues(
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index Linear issues and comments.

@@ -50,6 +59,7 @@ async def index_linear_issues(
        start_date: Start date for indexing (YYYY-MM-DD format)
        end_date: End date for indexing (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        on_heartbeat_callback: Optional callback to update notification during long-running indexing.

    Returns:
        Tuple containing (number of documents indexed, error message or None)

@@ -188,6 +198,9 @@ async def index_linear_issues(
    documents_skipped = 0
    skipped_issues = []

    # Heartbeat tracking - update notification periodically to prevent appearing stuck
    last_heartbeat_time = time.time()

    await task_logger.log_task_progress(
        log_entry,
        f"Starting to process {len(issues)} Linear issues",

@@ -196,6 +209,14 @@ async def index_linear_issues(

    # Process each issue
    for issue in issues:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()

        try:
            issue_id = issue.get("id", "")
            issue_identifier = issue.get("identifier", "")
@@ -2,6 +2,8 @@
Luma connector indexer.
"""

import time
from collections.abc import Awaitable, Callable
from datetime import datetime, timedelta

from sqlalchemy.exc import SQLAlchemyError

@@ -28,6 +30,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_luma_events(
    session: AsyncSession,

@@ -37,6 +45,7 @@ async def index_luma_events(
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index Luma events.

@@ -50,6 +59,7 @@ async def index_luma_events(
        end_date: End date for indexing (YYYY-MM-DD format). Can be in the future to index upcoming events.
            Defaults to today if not provided.
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        on_heartbeat_callback: Optional callback to update notification during long-running indexing.

    Returns:
        Tuple containing (number of documents indexed, error message or None)

@@ -221,7 +231,17 @@ async def index_luma_events(
    documents_skipped = 0
    skipped_events = []

    # Heartbeat tracking - update notification periodically to prevent appearing stuck
    last_heartbeat_time = time.time()

    for event in events:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()
        try:
            # Luma event structure fields - events have nested 'event' field
            event_data = event.get("event", {})
@@ -2,6 +2,7 @@
Notion connector indexer.
"""

import time
from collections.abc import Awaitable, Callable
from datetime import datetime

@@ -34,6 +35,13 @@ from .base import (
# Signature: async callback(retry_reason, attempt, max_attempts, wait_seconds) -> None
RetryCallbackType = Callable[[str, int, int, float], Awaitable[None]]

# Type alias for heartbeat callback
# Signature: async callback(indexed_count) -> None
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds - update notification every 30 seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_notion_pages(
    session: AsyncSession,

@@ -44,6 +52,7 @@ async def index_notion_pages(
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_retry_callback: RetryCallbackType | None = None,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index Notion pages from all accessible pages.

@@ -59,6 +68,8 @@ async def index_notion_pages(
        on_retry_callback: Optional callback for retry progress notifications.
            Signature: async callback(retry_reason, attempt, max_attempts, wait_seconds)
            retry_reason is one of: 'rate_limit', 'server_error', 'timeout'
        on_heartbeat_callback: Optional callback to update notification during long-running indexing.
            Called periodically with (indexed_count) to prevent task appearing stuck.

    Returns:
        Tuple containing (number of documents indexed, error message or None)

@@ -211,6 +222,9 @@ async def index_notion_pages(
    documents_skipped = 0
    skipped_pages = []

    # Heartbeat tracking - update notification periodically to prevent appearing stuck
    last_heartbeat_time = time.time()

    await task_logger.log_task_progress(
        log_entry,
        f"Starting to process {len(pages)} Notion pages",

@@ -219,6 +233,14 @@ async def index_notion_pages(

    # Process each page
    for page in pages:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()

        try:
            page_id = page.get("page_id")
            page_title = page.get("title", f"Untitled page ({page_id})")
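RetryCallbackType pins down the retry notification shape: the reason string ('rate_limit', 'server_error', or 'timeout'), the attempt counters, and the wait before the next try. A sketch of a callback a caller might supply; illustrative only, not code from this commit:

import logging

logger = logging.getLogger(__name__)


async def on_retry(
    retry_reason: str, attempt: int, max_attempts: int, wait_seconds: float
) -> None:
    # Hypothetical example matching RetryCallbackType;
    # retry_reason is one of: 'rate_limit', 'server_error', 'timeout'.
    logger.warning(
        f"Notion API retry ({retry_reason}): attempt {attempt}/{max_attempts}, "
        f"waiting {wait_seconds:.1f}s before retrying"
    )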
@@ -7,6 +7,8 @@ This connector is only available in self-hosted mode.

import os
import re
import time
from collections.abc import Awaitable, Callable
from datetime import UTC, datetime
from pathlib import Path

@@ -35,6 +37,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30


def parse_frontmatter(content: str) -> tuple[dict | None, str]:
    """

@@ -152,6 +160,7 @@ async def index_obsidian_vault(
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index notes from a local Obsidian vault.

@@ -167,6 +176,7 @@ async def index_obsidian_vault(
        start_date: Start date for filtering (YYYY-MM-DD format) - optional
        end_date: End date for filtering (YYYY-MM-DD format) - optional
        update_last_indexed: Whether to update the last_indexed_at timestamp
        on_heartbeat_callback: Optional callback to update notification during long-running indexing.

    Returns:
        Tuple containing (number of documents indexed, error message or None)

@@ -305,7 +315,17 @@ async def index_obsidian_vault(
    indexed_count = 0
    skipped_count = 0

    # Heartbeat tracking - update notification periodically to prevent appearing stuck
    last_heartbeat_time = time.time()

    for file_info in files:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(indexed_count)
            last_heartbeat_time = time.time()
        try:
            file_path = file_info["path"]
            relative_path = file_info["relative_path"]
@@ -2,6 +2,8 @@
Slack connector indexer.
"""

import time
from collections.abc import Awaitable, Callable
from datetime import datetime

from slack_sdk.errors import SlackApiError

@@ -29,6 +31,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds - update notification every 30 seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_slack_messages(
    session: AsyncSession,

@@ -38,6 +46,7 @@ async def index_slack_messages(
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index Slack messages from all accessible channels.

@@ -50,6 +59,8 @@ async def index_slack_messages(
        start_date: Start date for indexing (YYYY-MM-DD format)
        end_date: End date for indexing (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        on_heartbeat_callback: Optional callback to update notification during long-running indexing.
            Called periodically with (indexed_count) to prevent task appearing stuck.

    Returns:
        Tuple containing (number of documents indexed, error message or None)

@@ -164,6 +175,9 @@ async def index_slack_messages(
    documents_skipped = 0
    skipped_channels = []

    # Heartbeat tracking - update notification periodically to prevent appearing stuck
    last_heartbeat_time = time.time()

    await task_logger.log_task_progress(
        log_entry,
        f"Starting to process {len(channels)} Slack channels",

@@ -172,6 +186,13 @@ async def index_slack_messages(

    # Process each channel
    for channel_obj in channels:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()
        channel_id = channel_obj["id"]
        channel_name = channel_obj["name"]
        is_private = channel_obj["is_private"]
@@ -2,6 +2,8 @@
Microsoft Teams connector indexer.
"""

import time
from collections.abc import Awaitable, Callable
from datetime import UTC

from sqlalchemy.exc import SQLAlchemyError

@@ -28,6 +30,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds - update notification every 30 seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_teams_messages(
    session: AsyncSession,

@@ -37,6 +45,7 @@ async def index_teams_messages(
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index Microsoft Teams messages from all accessible teams and channels.

@@ -49,6 +58,8 @@ async def index_teams_messages(
        start_date: Start date for indexing (YYYY-MM-DD format)
        end_date: End date for indexing (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        on_heartbeat_callback: Optional callback to update notification during long-running indexing.
            Called periodically with (indexed_count) to prevent task appearing stuck.

    Returns:
        Tuple containing (number of documents indexed, error message or None)

@@ -161,6 +172,9 @@ async def index_teams_messages(
    documents_skipped = 0
    skipped_channels = []

    # Heartbeat tracking - update notification periodically to prevent appearing stuck
    last_heartbeat_time = time.time()

    await task_logger.log_task_progress(
        log_entry,
        f"Starting to process {len(teams)} Teams",

@@ -185,6 +199,14 @@ async def index_teams_messages(

    # Process each team
    for team in teams:
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()

        team_id = team.get("id")
        team_name = team.get("displayName", "Unknown Team")
@@ -2,6 +2,8 @@
Webcrawler connector indexer.
"""

import time
from collections.abc import Awaitable, Callable
from datetime import datetime

from sqlalchemy.exc import SQLAlchemyError

@@ -29,6 +31,12 @@ from .base import (
    update_connector_last_indexed,
)

# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]]

# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30


async def index_crawled_urls(
    session: AsyncSession,

@@ -38,6 +46,7 @@ async def index_crawled_urls(
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, str | None]:
    """
    Index web page URLs.

@@ -50,6 +59,7 @@ async def index_crawled_urls(
        start_date: Start date for filtering (YYYY-MM-DD format) - optional
        end_date: End date for filtering (YYYY-MM-DD format) - optional
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
        on_heartbeat_callback: Optional callback to update notification during long-running indexing.

    Returns:
        Tuple containing (number of documents indexed, error message or None)

@@ -140,7 +150,17 @@ async def index_crawled_urls(
    documents_skipped = 0
    failed_urls = []

    # Heartbeat tracking - update notification periodically to prevent appearing stuck
    last_heartbeat_time = time.time()

    for idx, url in enumerate(urls, 1):
        # Check if it's time for a heartbeat update
        if (
            on_heartbeat_callback
            and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS
        ):
            await on_heartbeat_callback(documents_indexed)
            last_heartbeat_time = time.time()
        try:
            logger.info(f"Processing URL {idx}/{len(urls)}: {url}")