mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-25 08:46:22 +02:00
171 lines
6.7 KiB
Python
171 lines
6.7 KiB
Python
"""Celery application configuration and setup."""
|
|
|
|
import os
|
|
|
|
from celery import Celery
|
|
from celery.schedules import crontab
|
|
from celery.signals import worker_process_init
|
|
from dotenv import load_dotenv
|
|
|
|
# Load environment variables via python-dotenv before the os.getenv()
# lookups further down in this module read them.
load_dotenv()
|
|
|
|
|
|
@worker_process_init.connect
def init_worker(**kwargs):
    """Set up the LLM Router and Image Gen Router for a new worker process.

    Connected to Celery's ``worker_process_init`` signal. Background tasks
    such as document summarization and image generation need Auto mode
    (the LiteLLM Router) to be available inside the worker.
    """
    # Imported inside the handler, so app.config is only loaded when a
    # worker process actually starts.
    from app.config import initialize_image_gen_router, initialize_llm_router

    # LLM router first, then the image generation router.
    for setup_router in (initialize_llm_router, initialize_image_gen_router):
        setup_router()
|
|
|
|
|
|
# Celery connection settings, each overridable through the environment.
CELERY_BROKER_URL = os.environ.get("CELERY_BROKER_URL", "redis://localhost:6379/0")
CELERY_RESULT_BACKEND = os.environ.get(
    "CELERY_RESULT_BACKEND", "redis://localhost:6379/0"
)
CELERY_TASK_DEFAULT_QUEUE = os.environ.get("CELERY_TASK_DEFAULT_QUEUE", "surfsense")

# Interval at which the schedule checker runs, read from the environment.
# Format: "<number><unit>" where unit is 'm' (minutes) or 'h' (hours)
# Examples: "1m" (every minute), "5m" (every 5 minutes), "1h" (every hour)
SCHEDULE_CHECKER_INTERVAL = os.environ.get("SCHEDULE_CHECKER_INTERVAL", "2m")
|
|
|
|
|
|
def parse_schedule_interval(interval: str) -> dict:
    """Parse an interval string into crontab parameters.

    Accepted forms are ``<number><unit>`` where the unit is ``m``/``min``
    (minutes) or ``h``/``hour`` (hours). Surrounding whitespace and letter
    case are ignored.

    Anything that cannot be parsed falls back to "every minute" instead of
    raising. That covers an unknown unit, a missing or non-integer number,
    and a zero or negative count. This function runs at import time on an
    environment variable, so a typo must not prevent the module from loading.

    Args:
        interval: String like "1m", "5m", "1h", etc.

    Returns:
        Dict with crontab parameters (minute, hour)
    """
    text = interval.strip().lower()

    # Map each recognised suffix to its unit. The suffix is sliced off
    # exactly, rather than via str.rstrip(), which strips a *set* of
    # characters and would accept strings such as "5nim".
    for suffix, unit in (("min", "m"), ("hour", "h"), ("m", "m"), ("h", "h")):
        if text.endswith(suffix):
            number_text = text[: -len(suffix)].strip()
            break
    else:
        # Unknown unit: default to every minute.
        return {"minute": "*", "hour": "*"}

    try:
        count = int(number_text)
    except ValueError:
        # Missing or non-integer number: default to every minute.
        return {"minute": "*", "hour": "*"}

    if count < 1:
        # "*/0" (or a negative step) is not a valid cron step.
        return {"minute": "*", "hour": "*"}

    # NOTE: a cron step restarts at each field boundary, so counts that do
    # not divide 60 (minutes) or 24 (hours) evenly give uneven gaps.
    if unit == "m":
        return {"minute": "*" if count == 1 else f"*/{count}", "hour": "*"}
    return {"minute": "0", "hour": "*" if count == 1 else f"*/{count}"}
|
|
|
|
|
|
# Parse the schedule interval into crontab keyword arguments (minute/hour).
# The result is unpacked into crontab(**schedule_params) in the Beat
# schedule at the bottom of this module.
schedule_params = parse_schedule_interval(SCHEDULE_CHECKER_INTERVAL)
|
|
|
|
# Create Celery app
# Broker and result backend URLs come from the environment-derived constants
# defined earlier in this module.
celery_app = Celery(
    "surfsense",
    broker=CELERY_BROKER_URL,
    backend=CELERY_RESULT_BACKEND,
    # Task modules handed to Celery via `include`.
    include=[
        "app.tasks.celery_tasks.document_tasks",
        "app.tasks.celery_tasks.podcast_tasks",
        "app.tasks.celery_tasks.connector_tasks",
        "app.tasks.celery_tasks.schedule_checker_task",
        "app.tasks.celery_tasks.document_reindex_tasks",
        "app.tasks.celery_tasks.stale_notification_cleanup_task",
    ],
)
|
|
|
|
# ── Queue names ──────────────────────────────────────────────
# Default queue   : fast, user-facing tasks (file upload, podcast, reindex, …)
# Connectors queue: slow, long-running indexing tasks (Notion, Gmail, web crawl, …)
# The connectors queue name is the default queue name with ".connectors"
# appended, so it follows any CELERY_TASK_DEFAULT_QUEUE override.
CONNECTORS_QUEUE = f"{CELERY_TASK_DEFAULT_QUEUE}.connectors"
|
|
|
|
# Apply the Celery settings for this app.
celery_app.conf.update(
    # Serialization, timezone, and default queue/exchange/routing key
    task_serializer="json",
    accept_content=["json"],
    result_serializer="json",
    timezone="UTC",
    enable_utc=True,
    task_default_queue=CELERY_TASK_DEFAULT_QUEUE,
    task_default_exchange=CELERY_TASK_DEFAULT_QUEUE,
    task_default_routing_key=CELERY_TASK_DEFAULT_QUEUE,
    # Task execution limits (values are in seconds)
    task_track_started=True,
    task_time_limit=8 * 60 * 60,  # 8 hour hard limit
    task_soft_time_limit=(7 * 60 + 50) * 60,  # 7 hours 50 minutes soft limit
    # Result backend
    result_expires=24 * 60 * 60,  # Results expire after 24 hours
    result_extended=True,
    # Worker behaviour
    worker_prefetch_multiplier=1,
    worker_max_tasks_per_child=1000,
    # Acknowledgement / retry behaviour
    task_acks_late=True,
    task_reject_on_worker_lost=True,
    # Broker connection
    broker_connection_retry_on_startup=True,
    # Beat scheduler
    beat_max_loop_interval=60,  # Check every minute
    # ── Task routing ─────────────────────────────────────────
    # Slow connector/indexing tasks get their own queue so they never
    # block fast user-facing tasks (file uploads, podcasts, etc.).
    # Everything not listed here (document processing, podcasts,
    # reindexing, schedule checker, cleanup) stays on the default queue.
    task_routes={
        task_name: {"queue": CONNECTORS_QUEUE}
        for task_name in (
            "index_slack_messages",
            "index_notion_pages",
            "index_github_repos",
            "index_linear_issues",
            "index_jira_issues",
            "index_confluence_pages",
            "index_clickup_tasks",
            "index_google_calendar_events",
            "index_airtable_records",
            "index_google_gmail_messages",
            "index_google_drive_files",
            "index_discord_messages",
            "index_teams_messages",
            "index_luma_events",
            "index_elasticsearch_documents",
            "index_crawled_urls",
            "index_bookstack_pages",
            "index_obsidian_vault",
            "index_composio_connector",
        )
    },
)
|
|
|
|
# Configure Celery Beat schedule
# This uses a meta-scheduler pattern: instead of creating individual Beat schedules
# for each connector, we have ONE schedule that checks the database at the configured interval
# for connectors that need indexing. This provides dynamic scheduling without restarts.
celery_app.conf.beat_schedule = {
    # Runs at the interval derived from SCHEDULE_CHECKER_INTERVAL
    # (schedule_params holds the parsed minute/hour crontab arguments).
    "check-periodic-connector-schedules": {
        "task": "check_periodic_schedules",
        "schedule": crontab(**schedule_params),
        "options": {
            "expires": 30,  # Task expires after 30 seconds if not picked up
        },
    },
    # Cleanup stale connector indexing notifications every 5 minutes
    # This detects tasks that crashed or timed out without proper cleanup
    # and marks their notifications as failed so users don't see perpetual "syncing"
    "cleanup-stale-indexing-notifications": {
        "task": "cleanup_stale_indexing_notifications",
        "schedule": crontab(minute="*/5"),  # Every 5 minutes
        "options": {
            "expires": 60,  # Task expires after 60 seconds if not picked up
        },
    },
}
|