diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py
index 191c6f954..6ba67fb69 100644
--- a/surfsense_backend/app/routes/search_source_connectors_routes.py
+++ b/surfsense_backend/app/routes/search_source_connectors_routes.py
@@ -187,6 +187,7 @@ async def create_search_source_connector(
             user_id=str(user.id),
             connector_type=db_connector.connector_type,
             frequency_minutes=db_connector.indexing_frequency_minutes,
+            connector_config=db_connector.config,
         )
         if not success:
             logger.warning(
@@ -646,6 +647,7 @@ async def index_connector_content(
 
         # Handle different connector types
         response_message = ""
+        indexing_started = True
 
         # Use UTC for consistency with last_indexed_at storage
         today_str = datetime.now(UTC).strftime("%Y-%m-%d")
@@ -921,14 +923,27 @@ async def index_connector_content(
 
         elif connector.connector_type == SearchSourceConnectorType.WEBCRAWLER_CONNECTOR:
             from app.tasks.celery_tasks.connector_tasks import index_crawled_urls_task
+            from app.utils.webcrawler_utils import parse_webcrawler_urls
 
-            logger.info(
-                f"Triggering web pages indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}"
-            )
-            index_crawled_urls_task.delay(
-                connector_id, search_space_id, str(user.id), indexing_from, indexing_to
-            )
-            response_message = "Web page indexing started in the background."
+            # Check if URLs are configured before triggering indexing
+            connector_config = connector.config or {}
+            urls = parse_webcrawler_urls(connector_config.get("INITIAL_URLS"))
+
+            if not urls:
+                # URLs are optional - skip indexing gracefully
+                logger.info(
+                    f"Webcrawler connector {connector_id} has no URLs configured, skipping indexing"
+                )
+                response_message = "No URLs configured for this connector. Add URLs in the connector settings to enable indexing."
+                indexing_started = False
+            else:
+                logger.info(
+                    f"Triggering web pages indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}"
+                )
+                index_crawled_urls_task.delay(
+                    connector_id, search_space_id, str(user.id), indexing_from, indexing_to
+                )
+                response_message = "Web page indexing started in the background."
 
         elif connector.connector_type == SearchSourceConnectorType.OBSIDIAN_CONNECTOR:
             from app.config import config as app_config
@@ -1025,6 +1040,7 @@ async def index_connector_content(
 
     return {
         "message": response_message,
+        "indexing_started": indexing_started,
        "connector_id": connector_id,
        "search_space_id": search_space_id,
        "indexing_from": indexing_from,
diff --git a/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py b/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py
index bf80cbe78..22d45af21 100644
--- a/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py
+++ b/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py
@@ -156,6 +156,39 @@ async def _check_and_trigger_schedules():
                 )
                 await session.commit()
                 continue
+
+            # Special handling for Webcrawler - skip if no URLs configured
+            elif (
+                connector.connector_type
+                == SearchSourceConnectorType.WEBCRAWLER_CONNECTOR
+            ):
+                from app.utils.webcrawler_utils import parse_webcrawler_urls
+
+                connector_config = connector.config or {}
+                urls = parse_webcrawler_urls(connector_config.get("INITIAL_URLS"))
+
+                if urls:
+                    task.delay(
+                        connector.id,
+                        connector.search_space_id,
+                        str(connector.user_id),
+                        None,  # start_date
+                        None,  # end_date
+                    )
+                else:
+                    # No URLs configured - skip indexing but still update next_scheduled_at
+                    logger.info(
+                        f"Webcrawler connector {connector.id} has no URLs configured, "
+                        "skipping periodic indexing (will check again at next scheduled time)"
+                    )
+                    from datetime import timedelta
+
+                    connector.next_scheduled_at = now + timedelta(
+                        minutes=connector.indexing_frequency_minutes
+                    )
+                    await session.commit()
+                    continue
+
             else:
                 task.delay(
                     connector.id,
diff --git a/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py b/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py
index 6ae070c06..0c63fd2f0 100644
--- a/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py
@@ -18,6 +18,7 @@ from app.utils.document_converters import (
     generate_document_summary,
     generate_unique_identifier_hash,
 )
+from app.utils.webcrawler_utils import parse_webcrawler_urls
 
 from .base import (
     check_document_by_unique_identifier,
@@ -97,13 +98,7 @@ async def index_crawled_urls(
         api_key = connector.config.get("FIRECRAWL_API_KEY")
 
         # Get URLs from connector config
-        initial_urls = connector.config.get("INITIAL_URLS", "")
-        if isinstance(initial_urls, str):
-            urls = [url.strip() for url in initial_urls.split("\n") if url.strip()]
-        elif isinstance(initial_urls, list):
-            urls = [url.strip() for url in initial_urls if url.strip()]
-        else:
-            urls = []
+        urls = parse_webcrawler_urls(connector.config.get("INITIAL_URLS"))
 
         logger.info(
             f"Starting crawled web page indexing for connector {connector_id} with {len(urls)} URLs"
diff --git a/surfsense_backend/app/utils/periodic_scheduler.py b/surfsense_backend/app/utils/periodic_scheduler.py
index 219641933..aa8c07ce4 100644
--- a/surfsense_backend/app/utils/periodic_scheduler.py
+++ b/surfsense_backend/app/utils/periodic_scheduler.py
@@ -43,6 +43,7 @@ def create_periodic_schedule(
     user_id: str,
     connector_type: SearchSourceConnectorType,
     frequency_minutes: int,
+    connector_config: dict | None = None,
 ) -> bool:
     """
     Trigger the first indexing run immediately when periodic indexing is enabled.
@@ -57,11 +58,26 @@ def create_periodic_schedule(
         user_id: User ID
         connector_type: Type of connector
         frequency_minutes: Frequency in minutes (used for logging)
+        connector_config: Optional connector config dict for validation
 
     Returns:
         True if successful, False otherwise
     """
     try:
+        # Special handling for connectors that require config validation
+        if connector_type == SearchSourceConnectorType.WEBCRAWLER_CONNECTOR:
+            from app.utils.webcrawler_utils import parse_webcrawler_urls
+
+            config = connector_config or {}
+            urls = parse_webcrawler_urls(config.get("INITIAL_URLS"))
+
+            if not urls:
+                logger.info(
+                    f"Webcrawler connector {connector_id} has no URLs configured, "
+                    "skipping first indexing run (will run when URLs are added)"
+                )
+                return True  # Return success - schedule is created, just no first run
+
         logger.info(
             f"Periodic indexing enabled for connector {connector_id} "
             f"(frequency: {frequency_minutes} minutes). Triggering first run..."
diff --git a/surfsense_backend/app/utils/webcrawler_utils.py b/surfsense_backend/app/utils/webcrawler_utils.py
new file mode 100644
index 000000000..d6baf6d73
--- /dev/null
+++ b/surfsense_backend/app/utils/webcrawler_utils.py
@@ -0,0 +1,29 @@
+"""
+Utility functions for webcrawler connector.
+
+This module is intentionally kept separate from the connector_indexers package
+to avoid circular import issues.
+"""
+
+
+def parse_webcrawler_urls(initial_urls: str | list | None) -> list[str]:
+    """
+    Parse URLs from webcrawler INITIAL_URLS value.
+
+    Handles both string (newline-separated) and list formats.
+
+    Args:
+        initial_urls: The INITIAL_URLS value (string, list, or None)
+
+    Returns:
+        List of parsed, stripped, non-empty URLs
+    """
+    if initial_urls is None:
+        return []
+
+    if isinstance(initial_urls, str):
+        return [url.strip() for url in initial_urls.split("\n") if url.strip()]
+    elif isinstance(initial_urls, list):
+        return [url.strip() for url in initial_urls if url.strip()]
+    else:
+        return []
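For reviewers, a minimal sketch of how the new parse_webcrawler_urls helper behaves, assuming the implementation added above in app/utils/webcrawler_utils.py; the URLs below are illustrative values, not taken from the repository:

    from app.utils.webcrawler_utils import parse_webcrawler_urls

    # Newline-separated string config: blank lines and surrounding whitespace are dropped
    assert parse_webcrawler_urls("https://example.com\n\n https://docs.example.com \n") == [
        "https://example.com",
        "https://docs.example.com",
    ]

    # List config: empty or whitespace-only entries are filtered out
    assert parse_webcrawler_urls(["https://example.com", "   "]) == ["https://example.com"]

    # Missing config (None) degrades to an empty list
    assert parse_webcrawler_urls(None) == []

An empty result is the condition the route, the schedule checker, and create_periodic_schedule all key off to skip indexing; in the route it also surfaces to the client as "indexing_started": false in the response body.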