remove dead indexing tasks and fix silent schedule breakage for live connectors

This commit is contained in:
CREDO23 2026-04-22 21:43:51 +02:00
parent 0eae96bffb
commit 9977f9b641
5 changed files with 29 additions and 824 deletions

View file

@ -135,20 +135,12 @@ celery_app.conf.update(
# never block fast user-facing tasks (file uploads, podcasts, etc.)
task_routes={
# Connector indexing tasks → connectors queue
"index_slack_messages": {"queue": CONNECTORS_QUEUE},
"index_notion_pages": {"queue": CONNECTORS_QUEUE},
"index_github_repos": {"queue": CONNECTORS_QUEUE},
"index_linear_issues": {"queue": CONNECTORS_QUEUE},
"index_jira_issues": {"queue": CONNECTORS_QUEUE},
"index_confluence_pages": {"queue": CONNECTORS_QUEUE},
"index_clickup_tasks": {"queue": CONNECTORS_QUEUE},
"index_google_calendar_events": {"queue": CONNECTORS_QUEUE},
"index_airtable_records": {"queue": CONNECTORS_QUEUE},
"index_google_gmail_messages": {"queue": CONNECTORS_QUEUE},
"index_google_drive_files": {"queue": CONNECTORS_QUEUE},
"index_discord_messages": {"queue": CONNECTORS_QUEUE},
"index_teams_messages": {"queue": CONNECTORS_QUEUE},
"index_luma_events": {"queue": CONNECTORS_QUEUE},
"index_elasticsearch_documents": {"queue": CONNECTORS_QUEUE},
"index_crawled_urls": {"queue": CONNECTORS_QUEUE},
"index_bookstack_pages": {"queue": CONNECTORS_QUEUE},

View file

@ -1219,57 +1219,6 @@ async def _update_connector_timestamp_by_id(session: AsyncSession, connector_id:
await session.rollback()
async def run_slack_indexing_with_new_session(
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""
Create a new session and run the Slack indexing task.
This prevents session leaks by creating a dedicated session for the background task.
"""
async with async_session_maker() as session:
await run_slack_indexing(
session, connector_id, search_space_id, user_id, start_date, end_date
)
async def run_slack_indexing(
session: AsyncSession,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""
Background task to run Slack indexing.
Args:
session: Database session
connector_id: ID of the Slack connector
search_space_id: ID of the search space
user_id: ID of the user
start_date: Start date for indexing
end_date: End date for indexing
"""
from app.tasks.connector_indexers import index_slack_messages
await _run_indexing_with_notifications(
session=session,
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
start_date=start_date,
end_date=end_date,
indexing_function=index_slack_messages,
update_timestamp_func=_update_connector_timestamp_by_id,
supports_heartbeat_callback=True,
)
_AUTH_ERROR_PATTERNS = (
"failed to refresh linear oauth",
"failed to refresh your notion connection",
@ -1808,215 +1757,6 @@ async def run_github_indexing(
)
# Add new helper functions for Linear indexing
async def run_linear_indexing_with_new_session(
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""Wrapper to run Linear indexing with its own database session."""
logger.info(
f"Background task started: Indexing Linear connector {connector_id} into space {search_space_id} from {start_date} to {end_date}"
)
async with async_session_maker() as session:
await run_linear_indexing(
session, connector_id, search_space_id, user_id, start_date, end_date
)
logger.info(f"Background task finished: Indexing Linear connector {connector_id}")
async def run_linear_indexing(
session: AsyncSession,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""
Background task to run Linear indexing.
Args:
session: Database session
connector_id: ID of the Linear connector
search_space_id: ID of the search space
user_id: ID of the user
start_date: Start date for indexing
end_date: End date for indexing
"""
from app.tasks.connector_indexers import index_linear_issues
await _run_indexing_with_notifications(
session=session,
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
start_date=start_date,
end_date=end_date,
indexing_function=index_linear_issues,
update_timestamp_func=_update_connector_timestamp_by_id,
supports_heartbeat_callback=True,
)
# Add new helper functions for discord indexing
async def run_discord_indexing_with_new_session(
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""
Create a new session and run the Discord indexing task.
This prevents session leaks by creating a dedicated session for the background task.
"""
async with async_session_maker() as session:
await run_discord_indexing(
session, connector_id, search_space_id, user_id, start_date, end_date
)
async def run_discord_indexing(
session: AsyncSession,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""
Background task to run Discord indexing.
Args:
session: Database session
connector_id: ID of the Discord connector
search_space_id: ID of the search space
user_id: ID of the user
start_date: Start date for indexing
end_date: End date for indexing
"""
from app.tasks.connector_indexers import index_discord_messages
await _run_indexing_with_notifications(
session=session,
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
start_date=start_date,
end_date=end_date,
indexing_function=index_discord_messages,
update_timestamp_func=_update_connector_timestamp_by_id,
supports_heartbeat_callback=True,
)
async def run_teams_indexing_with_new_session(
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""
Create a new session and run the Microsoft Teams indexing task.
This prevents session leaks by creating a dedicated session for the background task.
"""
async with async_session_maker() as session:
await run_teams_indexing(
session, connector_id, search_space_id, user_id, start_date, end_date
)
async def run_teams_indexing(
session: AsyncSession,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""
Background task to run Microsoft Teams indexing.
Args:
session: Database session
connector_id: ID of the Teams connector
search_space_id: ID of the search space
user_id: ID of the user
start_date: Start date for indexing
end_date: End date for indexing
"""
from app.tasks.connector_indexers.teams_indexer import index_teams_messages
await _run_indexing_with_notifications(
session=session,
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
start_date=start_date,
end_date=end_date,
indexing_function=index_teams_messages,
update_timestamp_func=_update_connector_timestamp_by_id,
supports_heartbeat_callback=True,
)
# Add new helper functions for Jira indexing
async def run_jira_indexing_with_new_session(
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""Wrapper to run Jira indexing with its own database session."""
logger.info(
f"Background task started: Indexing Jira connector {connector_id} into space {search_space_id} from {start_date} to {end_date}"
)
async with async_session_maker() as session:
await run_jira_indexing(
session, connector_id, search_space_id, user_id, start_date, end_date
)
logger.info(f"Background task finished: Indexing Jira connector {connector_id}")
async def run_jira_indexing(
session: AsyncSession,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""
Background task to run Jira indexing.
Args:
session: Database session
connector_id: ID of the Jira connector
search_space_id: ID of the search space
user_id: ID of the user
start_date: Start date for indexing
end_date: End date for indexing
"""
from app.tasks.connector_indexers import index_jira_issues
await _run_indexing_with_notifications(
session=session,
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
start_date=start_date,
end_date=end_date,
indexing_function=index_jira_issues,
update_timestamp_func=_update_connector_timestamp_by_id,
supports_heartbeat_callback=True,
)
# Add new helper functions for Confluence indexing
async def run_confluence_indexing_with_new_session(
connector_id: int,
@ -2072,112 +1812,6 @@ async def run_confluence_indexing(
)
# Add new helper functions for ClickUp indexing
async def run_clickup_indexing_with_new_session(
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""Wrapper to run ClickUp indexing with its own database session."""
logger.info(
f"Background task started: Indexing ClickUp connector {connector_id} into space {search_space_id} from {start_date} to {end_date}"
)
async with async_session_maker() as session:
await run_clickup_indexing(
session, connector_id, search_space_id, user_id, start_date, end_date
)
logger.info(f"Background task finished: Indexing ClickUp connector {connector_id}")
async def run_clickup_indexing(
session: AsyncSession,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""
Background task to run ClickUp indexing.
Args:
session: Database session
connector_id: ID of the ClickUp connector
search_space_id: ID of the search space
user_id: ID of the user
start_date: Start date for indexing
end_date: End date for indexing
"""
from app.tasks.connector_indexers import index_clickup_tasks
await _run_indexing_with_notifications(
session=session,
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
start_date=start_date,
end_date=end_date,
indexing_function=index_clickup_tasks,
update_timestamp_func=_update_connector_timestamp_by_id,
supports_heartbeat_callback=True,
)
# Add new helper functions for Airtable indexing
async def run_airtable_indexing_with_new_session(
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""Wrapper to run Airtable indexing with its own database session."""
logger.info(
f"Background task started: Indexing Airtable connector {connector_id} into space {search_space_id} from {start_date} to {end_date}"
)
async with async_session_maker() as session:
await run_airtable_indexing(
session, connector_id, search_space_id, user_id, start_date, end_date
)
logger.info(f"Background task finished: Indexing Airtable connector {connector_id}")
async def run_airtable_indexing(
session: AsyncSession,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""
Background task to run Airtable indexing.
Args:
session: Database session
connector_id: ID of the Airtable connector
search_space_id: ID of the search space
user_id: ID of the user
start_date: Start date for indexing
end_date: End date for indexing
"""
from app.tasks.connector_indexers import index_airtable_records
await _run_indexing_with_notifications(
session=session,
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
start_date=start_date,
end_date=end_date,
indexing_function=index_airtable_records,
update_timestamp_func=_update_connector_timestamp_by_id,
supports_heartbeat_callback=True,
)
# Add new helper functions for Google Calendar indexing
async def run_google_calendar_indexing_with_new_session(
connector_id: int,
@ -2716,58 +2350,6 @@ async def run_dropbox_indexing(
logger.error(f"Failed to update notification: {notif_error!s}")
# Add new helper functions for luma indexing
async def run_luma_indexing_with_new_session(
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""
Create a new session and run the Luma indexing task.
This prevents session leaks by creating a dedicated session for the background task.
"""
async with async_session_maker() as session:
await run_luma_indexing(
session, connector_id, search_space_id, user_id, start_date, end_date
)
async def run_luma_indexing(
session: AsyncSession,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""
Background task to run Luma indexing.
Args:
session: Database session
connector_id: ID of the Luma connector
search_space_id: ID of the search space
user_id: ID of the user
start_date: Start date for indexing
end_date: End date for indexing
"""
from app.tasks.connector_indexers import index_luma_events
await _run_indexing_with_notifications(
session=session,
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
start_date=start_date,
end_date=end_date,
indexing_function=index_luma_events,
update_timestamp_func=_update_connector_timestamp_by_id,
supports_heartbeat_callback=True,
)
async def run_elasticsearch_indexing_with_new_session(
connector_id: int,
search_space_id: int,

View file

@ -39,52 +39,6 @@ def _handle_greenlet_error(e: Exception, task_name: str, connector_id: int) -> N
)
@celery_app.task(name="index_slack_messages", bind=True)
def index_slack_messages_task(
self,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""Celery task to index Slack messages."""
import asyncio
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
loop.run_until_complete(
_index_slack_messages(
connector_id, search_space_id, user_id, start_date, end_date
)
)
except Exception as e:
_handle_greenlet_error(e, "index_slack_messages", connector_id)
raise
finally:
loop.close()
async def _index_slack_messages(
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""Index Slack messages with new session."""
from app.routes.search_source_connectors_routes import (
run_slack_indexing,
)
async with get_celery_session_maker()() as session:
await run_slack_indexing(
session, connector_id, search_space_id, user_id, start_date, end_date
)
@celery_app.task(name="index_notion_pages", bind=True)
def index_notion_pages_task(
self,
@ -174,92 +128,6 @@ async def _index_github_repos(
)
@celery_app.task(name="index_linear_issues", bind=True)
def index_linear_issues_task(
self,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""Celery task to index Linear issues."""
import asyncio
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
loop.run_until_complete(
_index_linear_issues(
connector_id, search_space_id, user_id, start_date, end_date
)
)
finally:
loop.close()
async def _index_linear_issues(
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""Index Linear issues with new session."""
from app.routes.search_source_connectors_routes import (
run_linear_indexing,
)
async with get_celery_session_maker()() as session:
await run_linear_indexing(
session, connector_id, search_space_id, user_id, start_date, end_date
)
@celery_app.task(name="index_jira_issues", bind=True)
def index_jira_issues_task(
self,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""Celery task to index Jira issues."""
import asyncio
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
loop.run_until_complete(
_index_jira_issues(
connector_id, search_space_id, user_id, start_date, end_date
)
)
finally:
loop.close()
async def _index_jira_issues(
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""Index Jira issues with new session."""
from app.routes.search_source_connectors_routes import (
run_jira_indexing,
)
async with get_celery_session_maker()() as session:
await run_jira_indexing(
session, connector_id, search_space_id, user_id, start_date, end_date
)
@celery_app.task(name="index_confluence_pages", bind=True)
def index_confluence_pages_task(
self,
@ -303,49 +171,6 @@ async def _index_confluence_pages(
)
@celery_app.task(name="index_clickup_tasks", bind=True)
def index_clickup_tasks_task(
self,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""Celery task to index ClickUp tasks."""
import asyncio
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
loop.run_until_complete(
_index_clickup_tasks(
connector_id, search_space_id, user_id, start_date, end_date
)
)
finally:
loop.close()
async def _index_clickup_tasks(
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""Index ClickUp tasks with new session."""
from app.routes.search_source_connectors_routes import (
run_clickup_indexing,
)
async with get_celery_session_maker()() as session:
await run_clickup_indexing(
session, connector_id, search_space_id, user_id, start_date, end_date
)
@celery_app.task(name="index_google_calendar_events", bind=True)
def index_google_calendar_events_task(
self,
@ -392,49 +217,6 @@ async def _index_google_calendar_events(
)
@celery_app.task(name="index_airtable_records", bind=True)
def index_airtable_records_task(
self,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""Celery task to index Airtable records."""
import asyncio
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
loop.run_until_complete(
_index_airtable_records(
connector_id, search_space_id, user_id, start_date, end_date
)
)
finally:
loop.close()
async def _index_airtable_records(
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""Index Airtable records with new session."""
from app.routes.search_source_connectors_routes import (
run_airtable_indexing,
)
async with get_celery_session_maker()() as session:
await run_airtable_indexing(
session, connector_id, search_space_id, user_id, start_date, end_date
)
@celery_app.task(name="index_google_gmail_messages", bind=True)
def index_google_gmail_messages_task(
self,
@ -622,135 +404,6 @@ async def _index_dropbox_files(
)
@celery_app.task(name="index_discord_messages", bind=True)
def index_discord_messages_task(
self,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""Celery task to index Discord messages."""
import asyncio
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
loop.run_until_complete(
_index_discord_messages(
connector_id, search_space_id, user_id, start_date, end_date
)
)
finally:
loop.close()
async def _index_discord_messages(
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""Index Discord messages with new session."""
from app.routes.search_source_connectors_routes import (
run_discord_indexing,
)
async with get_celery_session_maker()() as session:
await run_discord_indexing(
session, connector_id, search_space_id, user_id, start_date, end_date
)
@celery_app.task(name="index_teams_messages", bind=True)
def index_teams_messages_task(
self,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""Celery task to index Microsoft Teams messages."""
import asyncio
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
loop.run_until_complete(
_index_teams_messages(
connector_id, search_space_id, user_id, start_date, end_date
)
)
finally:
loop.close()
async def _index_teams_messages(
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""Index Microsoft Teams messages with new session."""
from app.routes.search_source_connectors_routes import (
run_teams_indexing,
)
async with get_celery_session_maker()() as session:
await run_teams_indexing(
session, connector_id, search_space_id, user_id, start_date, end_date
)
@celery_app.task(name="index_luma_events", bind=True)
def index_luma_events_task(
self,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""Celery task to index Luma events."""
import asyncio
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
loop.run_until_complete(
_index_luma_events(
connector_id, search_space_id, user_id, start_date, end_date
)
)
finally:
loop.close()
async def _index_luma_events(
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""Index Luma events with new session."""
from app.routes.search_source_connectors_routes import (
run_luma_indexing,
)
async with get_celery_session_maker()() as session:
await run_luma_indexing(
session, connector_id, search_space_id, user_id, start_date, end_date
)
@celery_app.task(name="index_elasticsearch_documents", bind=True)
def index_elasticsearch_documents_task(
self,

View file

@ -77,8 +77,32 @@ async def _check_and_trigger_schedules():
SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR: index_google_calendar_events_task,
}
_LIVE_CONNECTOR_TYPES = {
SearchSourceConnectorType.SLACK_CONNECTOR,
SearchSourceConnectorType.TEAMS_CONNECTOR,
SearchSourceConnectorType.LINEAR_CONNECTOR,
SearchSourceConnectorType.JIRA_CONNECTOR,
SearchSourceConnectorType.CLICKUP_CONNECTOR,
SearchSourceConnectorType.GOOGLE_CALENDAR_CONNECTOR,
SearchSourceConnectorType.AIRTABLE_CONNECTOR,
SearchSourceConnectorType.GOOGLE_GMAIL_CONNECTOR,
SearchSourceConnectorType.DISCORD_CONNECTOR,
SearchSourceConnectorType.LUMA_CONNECTOR,
}
# Trigger indexing for each due connector
for connector in due_connectors:
if connector.connector_type in _LIVE_CONNECTOR_TYPES:
connector.periodic_indexing_enabled = False
connector.next_scheduled_at = None
await session.commit()
logger.info(
"Disabled obsolete periodic indexing for live connector %s (%s)",
connector.id,
connector.connector_type.value,
)
continue
# Primary guard: Redis lock indicates a task is currently running.
if is_connector_indexing_locked(connector.id):
logger.info(

View file

@ -1,77 +1,31 @@
"""
Connector indexers module for background tasks.
This module provides a collection of connector indexers for different platforms
and services. Each indexer is responsible for handling the indexing of content
from a specific connector type.
Available indexers:
- Slack: Index messages from Slack channels
- Notion: Index pages from Notion workspaces
- GitHub: Index repositories and files from GitHub
- Linear: Index issues from Linear workspaces
- Jira: Index issues from Jira projects
- Confluence: Index pages from Confluence spaces
- BookStack: Index pages from BookStack wiki instances
- Discord: Index messages from Discord servers
- ClickUp: Index tasks from ClickUp workspaces
- Google Gmail: Index messages from Google Gmail
- Google Calendar: Index events from Google Calendar
- Luma: Index events from Luma
- Webcrawler: Index crawled URLs
- Elasticsearch: Index documents from Elasticsearch instances
Each indexer handles content indexing from a specific connector type.
Live connectors (Slack, Linear, Jira, ClickUp, Airtable, Discord, Teams,
Luma) now use real-time agent tools instead of background indexing.
"""
# Communication platforms
# Calendar and scheduling
from .airtable_indexer import index_airtable_records
from .bookstack_indexer import index_bookstack_pages
# Note: composio_indexer is imported directly in connector_tasks.py to avoid circular imports
from .clickup_indexer import index_clickup_tasks
from .confluence_indexer import index_confluence_pages
from .discord_indexer import index_discord_messages
# Development platforms
from .elasticsearch_indexer import index_elasticsearch_documents
from .github_indexer import index_github_repos
from .google_calendar_indexer import index_google_calendar_events
from .google_drive_indexer import index_google_drive_files
from .google_gmail_indexer import index_google_gmail_messages
from .jira_indexer import index_jira_issues
# Issue tracking and project management
from .linear_indexer import index_linear_issues
# Documentation and knowledge management
from .luma_indexer import index_luma_events
from .notion_indexer import index_notion_pages
from .obsidian_indexer import index_obsidian_vault
from .slack_indexer import index_slack_messages
from .webcrawler_indexer import index_crawled_urls
__all__ = [ # noqa: RUF022
"index_airtable_records",
__all__ = [
"index_bookstack_pages",
# "index_composio_connector", # Imported directly in connector_tasks.py to avoid circular imports
"index_clickup_tasks",
"index_confluence_pages",
"index_discord_messages",
# Development platforms
"index_elasticsearch_documents",
"index_github_repos",
# Calendar and scheduling
"index_google_calendar_events",
"index_google_drive_files",
"index_luma_events",
"index_jira_issues",
# Issue tracking and project management
"index_linear_issues",
# Documentation and knowledge management
"index_google_gmail_messages",
"index_notion_pages",
"index_obsidian_vault",
"index_crawled_urls",
# Communication platforms
"index_slack_messages",
"index_google_gmail_messages",
]