fix: skip webcrawler indexing gracefully when no URLs configured

2026-05-15 18:25:18 +02:00 · 2026-01-28 17:54:46 +02:00 · 2026-01-28 17:54:46 +02:00 · b20fbaca4b
commit b20fbaca4b
parent 4f7ed8439f
5 changed files with 103 additions and 14 deletions
--- a/surfsense_backend/app/utils/periodic_scheduler.py
+++ b/surfsense_backend/app/utils/periodic_scheduler.py
@@ -43,6 +43,7 @@ def create_periodic_schedule(
    user_id: str,
    connector_type: SearchSourceConnectorType,
    frequency_minutes: int,
+    connector_config: dict | None = None,
 ) -> bool:
    """
    Trigger the first indexing run immediately when periodic indexing is enabled.
@@ -57,11 +58,26 @@ def create_periodic_schedule(
        user_id: User ID
        connector_type: Type of connector
        frequency_minutes: Frequency in minutes (used for logging)
+        connector_config: Optional connector config dict for validation

    Returns:
        True if successful, False otherwise
    """
    try:
+        # Special handling for connectors that require config validation
+        if connector_type == SearchSourceConnectorType.WEBCRAWLER_CONNECTOR:
+            from app.utils.webcrawler_utils import parse_webcrawler_urls
+
+            config = connector_config or {}
+            urls = parse_webcrawler_urls(config.get("INITIAL_URLS"))
+
+            if not urls:
+                logger.info(
+                    f"Webcrawler connector {connector_id} has no URLs configured, "
+                    "skipping first indexing run (will run when URLs are added)"
+                )
+                return True  # Return success - schedule is created, just no first run
+
        logger.info(
            f"Periodic indexing enabled for connector {connector_id} "
            f"(frequency: {frequency_minutes} minutes). Triggering first run..."
--- a/surfsense_backend/app/utils/webcrawler_utils.py
+++ b/surfsense_backend/app/utils/webcrawler_utils.py
@@ -0,0 +1,29 @@
+"""
+Utility functions for webcrawler connector.
+
+This module is intentionally kept separate from the connector_indexers package
+to avoid circular import issues.
+"""
+
+
+def parse_webcrawler_urls(initial_urls: str | list | None) -> list[str]:
+    """
+    Parse URLs from webcrawler INITIAL_URLS value.
+
+    Strings are split on newlines; list items must be strings (each is stripped).
+
+    Args:
+        initial_urls: The INITIAL_URLS value (string, list, or None)
+
+    Returns:
+        Stripped, non-empty entries; [] for None or any unsupported type
+    """
+    if initial_urls is None:
+        return []
+
+    if isinstance(initial_urls, str):
+        return [url.strip() for url in initial_urls.split("\n") if url.strip()]
+    elif isinstance(initial_urls, list):
+        return [url.strip() for url in initial_urls if url.strip()]
+    else:
+        return []