fix: skip webcrawler indexing gracefully when no URLs configured

2026-07-24 23:41:10 +02:00 · 2026-01-28 17:54:46 +02:00 · 2026-01-28 17:54:46 +02:00 · b20fbaca4b
commit b20fbaca4b
parent 4f7ed8439f
5 changed files with 103 additions and 14 deletions
--- a/surfsense_backend/app/routes/search_source_connectors_routes.py
+++ b/surfsense_backend/app/routes/search_source_connectors_routes.py
@ -187,6 +187,7 @@ async def create_search_source_connector(
                user_id=str(user.id),
                connector_type=db_connector.connector_type,
                frequency_minutes=db_connector.indexing_frequency_minutes,
+                connector_config=db_connector.config,
            )
            if not success:
                logger.warning(
@ -646,6 +647,7 @@ async def index_connector_content(

        # Handle different connector types
        response_message = ""
+        indexing_started = True
        # Use UTC for consistency with last_indexed_at storage
        today_str = datetime.now(UTC).strftime("%Y-%m-%d")

@ -921,14 +923,27 @@ async def index_connector_content(

        elif connector.connector_type == SearchSourceConnectorType.WEBCRAWLER_CONNECTOR:
            from app.tasks.celery_tasks.connector_tasks import index_crawled_urls_task
+            from app.utils.webcrawler_utils import parse_webcrawler_urls

-            logger.info(
-                f"Triggering web pages indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}"
-            )
-            index_crawled_urls_task.delay(
-                connector_id, search_space_id, str(user.id), indexing_from, indexing_to
-            )
-            response_message = "Web page indexing started in the background."
+            # Check if URLs are configured before triggering indexing
+            connector_config = connector.config or {}
+            urls = parse_webcrawler_urls(connector_config.get("INITIAL_URLS"))
+
+            if not urls:
+                # URLs are optional - skip indexing gracefully
+                logger.info(
+                    f"Webcrawler connector {connector_id} has no URLs configured, skipping indexing"
+                )
+                response_message = "No URLs configured for this connector. Add URLs in the connector settings to enable indexing."
+                indexing_started = False
+            else:
+                logger.info(
+                    f"Triggering web pages indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}"
+                )
+                index_crawled_urls_task.delay(
+                    connector_id, search_space_id, str(user.id), indexing_from, indexing_to
+                )
+                response_message = "Web page indexing started in the background."

        elif connector.connector_type == SearchSourceConnectorType.OBSIDIAN_CONNECTOR:
            from app.config import config as app_config
@ -1025,6 +1040,7 @@ async def index_connector_content(

        return {
            "message": response_message,
+            "indexing_started": indexing_started,
            "connector_id": connector_id,
            "search_space_id": search_space_id,
            "indexing_from": indexing_from,
--- a/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py
+++ b/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py
@ -156,6 +156,39 @@ async def _check_and_trigger_schedules():
                            )
                            await session.commit()
                            continue
+
+                    # Special handling for Webcrawler - skip if no URLs configured
+                    elif (
+                        connector.connector_type
+                        == SearchSourceConnectorType.WEBCRAWLER_CONNECTOR
+                    ):
+                        from app.utils.webcrawler_utils import parse_webcrawler_urls
+
+                        connector_config = connector.config or {}
+                        urls = parse_webcrawler_urls(connector_config.get("INITIAL_URLS"))
+
+                        if urls:
+                            task.delay(
+                                connector.id,
+                                connector.search_space_id,
+                                str(connector.user_id),
+                                None,  # start_date
+                                None,  # end_date
+                            )
+                        else:
+                            # No URLs configured - skip indexing but still update next_scheduled_at
+                            logger.info(
+                                f"Webcrawler connector {connector.id} has no URLs configured, "
+                                "skipping periodic indexing (will check again at next scheduled time)"
+                            )
+                            from datetime import timedelta
+
+                            connector.next_scheduled_at = now + timedelta(
+                                minutes=connector.indexing_frequency_minutes
+                            )
+                            await session.commit()
+                            continue
+
                    else:
                        task.delay(
                            connector.id,
--- a/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py
@ -18,6 +18,7 @@ from app.utils.document_converters import (
    generate_document_summary,
    generate_unique_identifier_hash,
 )
+from app.utils.webcrawler_utils import parse_webcrawler_urls

 from .base import (
    check_document_by_unique_identifier,
@ -97,13 +98,7 @@ async def index_crawled_urls(
        api_key = connector.config.get("FIRECRAWL_API_KEY")

        # Get URLs from connector config
-        initial_urls = connector.config.get("INITIAL_URLS", "")
-        if isinstance(initial_urls, str):
-            urls = [url.strip() for url in initial_urls.split("\n") if url.strip()]
-        elif isinstance(initial_urls, list):
-            urls = [url.strip() for url in initial_urls if url.strip()]
-        else:
-            urls = []
+        urls = parse_webcrawler_urls(connector.config.get("INITIAL_URLS"))

        logger.info(
            f"Starting crawled web page indexing for connector {connector_id} with {len(urls)} URLs"
--- a/surfsense_backend/app/utils/periodic_scheduler.py
+++ b/surfsense_backend/app/utils/periodic_scheduler.py
@ -43,6 +43,7 @@ def create_periodic_schedule(
    user_id: str,
    connector_type: SearchSourceConnectorType,
    frequency_minutes: int,
+    connector_config: dict | None = None,
 ) -> bool:
    """
    Trigger the first indexing run immediately when periodic indexing is enabled.
@ -57,11 +58,26 @@ def create_periodic_schedule(
        user_id: User ID
        connector_type: Type of connector
        frequency_minutes: Frequency in minutes (used for logging)
+        connector_config: Optional connector config dict for validation

    Returns:
        True if successful, False otherwise
    """
    try:
+        # Special handling for connectors that require config validation
+        if connector_type == SearchSourceConnectorType.WEBCRAWLER_CONNECTOR:
+            from app.utils.webcrawler_utils import parse_webcrawler_urls
+
+            config = connector_config or {}
+            urls = parse_webcrawler_urls(config.get("INITIAL_URLS"))
+
+            if not urls:
+                logger.info(
+                    f"Webcrawler connector {connector_id} has no URLs configured, "
+                    "skipping first indexing run (will run when URLs are added)"
+                )
+                return True  # Return success - schedule is created, just no first run
+
        logger.info(
            f"Periodic indexing enabled for connector {connector_id} "
            f"(frequency: {frequency_minutes} minutes). Triggering first run..."
--- a/surfsense_backend/app/utils/webcrawler_utils.py
+++ b/surfsense_backend/app/utils/webcrawler_utils.py
@ -0,0 +1,29 @@
+"""
+Utility functions for webcrawler connector.
+
+This module is intentionally kept separate from the connector_indexers package
+to avoid circular import issues.
+"""
+
+
+def parse_webcrawler_urls(initial_urls: str | list | None) -> list[str]:
+    """
+    Parse URLs from webcrawler INITIAL_URLS value.
+
+    Handles both string (newline-separated) and list formats. Non-string list
+    entries (e.g. None from malformed config) are skipped rather than raising.
+
+    Args:
+        initial_urls: The INITIAL_URLS value (string, list, or None)
+
+    Returns:
+        List of parsed, stripped, non-empty URLs
+    """
+    if isinstance(initial_urls, str):
+        candidates = initial_urls.split("\n")
+    elif isinstance(initial_urls, list):
+        candidates = [item for item in initial_urls if isinstance(item, str)]
+    else:
+        # None or any unsupported type: nothing to crawl
+        candidates = []
+    return [url.strip() for url in candidates if url.strip()]