# SurfSense/surfsense_backend/app/celery_app.py

"""Celery application configuration and setup."""

import os

from celery import Celery
from celery.schedules import crontab
from celery.signals import worker_process_init
from dotenv import load_dotenv

# Load environment variables (python-dotenv) at import time, before the
# os.getenv() lookups further down read the Celery settings.
load_dotenv()


@worker_process_init.connect
def init_worker(**kwargs):
    """Set up the LLM and image-generation routers for a new worker process.

    Connected to Celery's ``worker_process_init`` signal, so it runs once in
    every worker process. Background tasks such as document summarization and
    image generation rely on the Auto mode (LiteLLM Router) being available.

    Args:
        **kwargs: Signal arguments supplied by Celery; unused.
    """
    # Imported here rather than at module level so it only happens inside
    # the worker process.
    from app.config import initialize_image_gen_router, initialize_llm_router

    # Order matters only in that it mirrors the original: LLM router first.
    for initialize_router in (initialize_llm_router, initialize_image_gen_router):
        initialize_router()


# Celery connection and queue settings, each overridable via the environment
CELERY_BROKER_URL = os.environ.get("CELERY_BROKER_URL", "redis://localhost:6379/0")
CELERY_RESULT_BACKEND = os.environ.get(
    "CELERY_RESULT_BACKEND", "redis://localhost:6379/0"
)
CELERY_TASK_DEFAULT_QUEUE = os.environ.get("CELERY_TASK_DEFAULT_QUEUE", "surfsense")

# How often the schedule checker runs, read from the environment.
# Format: "<number><unit>" where unit is 'm' (minutes) or 'h' (hours)
# Examples: "1m" (every minute), "5m" (every 5 minutes), "1h" (every hour)
SCHEDULE_CHECKER_INTERVAL = os.environ.get("SCHEDULE_CHECKER_INTERVAL", "2m")


def parse_schedule_interval(interval: str) -> dict:
    """Parse an interval string into crontab parameters.

    The expected format is ``<number><unit>`` (case-insensitive, surrounding
    whitespace ignored) where the unit is one of:

    * minutes: ``m``, ``min``, ``mins``, ``minute``, ``minutes``
    * hours: ``h``, ``hr``, ``hrs``, ``hour``, ``hours``

    A minute count that is a whole number of hours (``"60m"``, ``"120m"``)
    is converted to the equivalent hourly schedule, because a step of 60 or
    more cannot be expressed on the minute field.

    Input that cannot be interpreted (missing or non-numeric number, unknown
    unit, zero or negative value) falls back to "every minute" instead of
    raising, so a malformed setting cannot break module import.

    Args:
        interval: String like "1m", "5m", "1h", etc.

    Returns:
        Dict with crontab parameters (minute, hour)
    """
    every_minute = {"minute": "*", "hour": "*"}
    minute_units = ("m", "min", "mins", "minute", "minutes")
    hour_units = ("h", "hr", "hrs", "hour", "hours")

    text = interval.strip().lower()

    # Split "<number><unit>" by peeling every trailing letter off the end.
    # Stripping a character set is intentional here: whatever letters trail
    # the number form the unit, which is validated separately below.
    number_part = text.rstrip("abcdefghijklmnopqrstuvwxyz")
    unit = text[len(number_part):]

    try:
        num = int(number_part)
    except ValueError:
        # No usable number (e.g. "", "m", "abcm")
        return every_minute

    if num < 1:
        # "*/0" and negative steps are not valid crontab expressions
        return every_minute

    if unit in minute_units and num % 60 == 0:
        # Whole hours written in minutes, e.g. "120m" -> every 2 hours
        unit, num = "h", num // 60

    # NOTE(review): steps that do not fit the field (e.g. "90m", "48h") are
    # passed through unchanged, as before; they presumably do not yield the
    # literal interval -- confirm against the crontab implementation.
    if unit in minute_units:
        return {"minute": "*" if num == 1 else f"*/{num}", "hour": "*"}
    if unit in hour_units:
        return {"minute": "0", "hour": "*" if num == 1 else f"*/{num}"}

    # Unknown or missing unit
    return every_minute


# Turn the configured checker interval into crontab keyword arguments
schedule_params = parse_schedule_interval(SCHEDULE_CHECKER_INTERVAL)

# Task modules handed to Celery's ``include`` option
_task_modules = [
    "app.tasks.celery_tasks.document_tasks",
    "app.tasks.celery_tasks.podcast_tasks",
    "app.tasks.celery_tasks.connector_tasks",
    "app.tasks.celery_tasks.schedule_checker_task",
    "app.tasks.celery_tasks.document_reindex_tasks",
    "app.tasks.celery_tasks.stale_notification_cleanup_task",
]

# The Celery application instance used by workers and Beat
celery_app = Celery(
    main="surfsense",
    broker=CELERY_BROKER_URL,
    backend=CELERY_RESULT_BACKEND,
    include=_task_modules,
)

# ── Queue names ──────────────────────────────────────────────
# Default queue  : fast, user-facing tasks (file upload, podcast, reindex, …)
# Connectors queue: slow, long-running indexing tasks (Notion, Gmail, web crawl, …)
CONNECTORS_QUEUE = f"{CELERY_TASK_DEFAULT_QUEUE}.connectors"

# Apply the application-wide Celery settings in one call.
# NOTE(review): task_acks_late=True is combined with an 8 hour time limit.
# Celery's Redis broker documentation describes a visibility timeout after
# which unacknowledged tasks are redelivered -- confirm whether
# broker_transport_options={"visibility_timeout": ...} is needed here.
celery_app.conf.update(
    # Serialization and time zone
    task_serializer="json",
    accept_content=["json"],
    result_serializer="json",
    timezone="UTC",
    enable_utc=True,
    # Default queue / exchange / routing key all share the same name
    task_default_queue=CELERY_TASK_DEFAULT_QUEUE,
    task_default_exchange=CELERY_TASK_DEFAULT_QUEUE,
    task_default_routing_key=CELERY_TASK_DEFAULT_QUEUE,
    # Execution limits
    task_track_started=True,
    task_time_limit=28800,  # hard limit: 8 hours
    task_soft_time_limit=28200,  # soft limit: 7 hours 50 minutes
    # Result backend
    result_expires=86400,  # keep results for 24 hours
    result_extended=True,
    # Worker behaviour
    worker_prefetch_multiplier=1,
    worker_max_tasks_per_child=1000,
    # Acknowledgement / retry behaviour
    task_acks_late=True,
    task_reject_on_worker_lost=True,
    # Broker
    broker_connection_retry_on_startup=True,
    # Beat scheduler
    beat_max_loop_interval=60,  # check every minute
    # ── Task routing ─────────────────────────────────────────
    # Slow connector/indexing tasks go to a dedicated queue so they never
    # block fast user-facing tasks (file uploads, podcasts, etc.).
    # Everything not listed here (document processing, podcasts, reindexing,
    # schedule checker, cleanup) stays on the default fast queue.
    task_routes={
        task_name: {"queue": CONNECTORS_QUEUE}
        for task_name in (
            "index_slack_messages",
            "index_notion_pages",
            "index_github_repos",
            "index_linear_issues",
            "index_jira_issues",
            "index_confluence_pages",
            "index_clickup_tasks",
            "index_google_calendar_events",
            "index_airtable_records",
            "index_google_gmail_messages",
            "index_google_drive_files",
            "index_discord_messages",
            "index_teams_messages",
            "index_luma_events",
            "index_elasticsearch_documents",
            "index_crawled_urls",
            "index_bookstack_pages",
            "index_obsidian_vault",
            "index_composio_connector",
        )
    },
)

# Celery Beat schedule.
# Meta-scheduler pattern: rather than one Beat entry per connector, a single
# entry runs at the configured interval and checks the database for connectors
# that need indexing. This provides dynamic scheduling without restarts.
_periodic_schedule_check = {
    "task": "check_periodic_schedules",
    "schedule": crontab(**schedule_params),
    # Task expires after 30 seconds if not picked up
    "options": {"expires": 30},
}

# Runs every 5 minutes to clean up stale connector indexing notifications.
# It detects tasks that crashed or timed out without proper cleanup and marks
# their notifications as failed so users don't see perpetual "syncing".
_stale_notification_cleanup = {
    "task": "cleanup_stale_indexing_notifications",
    "schedule": crontab(minute="*/5"),
    # Task expires after 60 seconds if not picked up
    "options": {"expires": 60},
}

celery_app.conf.beat_schedule = {
    "check-periodic-connector-schedules": _periodic_schedule_check,
    "cleanup-stale-indexing-notifications": _stale_notification_cleanup,
}