fix: skip webcrawler indexing gracefully when no URLs configured

This commit is contained in:
CREDO23 2026-01-28 17:54:46 +02:00
parent 4f7ed8439f
commit b20fbaca4b
5 changed files with 103 additions and 14 deletions

View file

@ -43,6 +43,7 @@ def create_periodic_schedule(
user_id: str,
connector_type: SearchSourceConnectorType,
frequency_minutes: int,
connector_config: dict | None = None,
) -> bool:
"""
Trigger the first indexing run immediately when periodic indexing is enabled.
@ -57,11 +58,26 @@ def create_periodic_schedule(
user_id: User ID
connector_type: Type of connector
frequency_minutes: Frequency in minutes (used for logging)
connector_config: Optional connector config dict for validation
Returns:
True if successful, False otherwise
"""
try:
# Special handling for connectors that require config validation
if connector_type == SearchSourceConnectorType.WEBCRAWLER_CONNECTOR:
from app.utils.webcrawler_utils import parse_webcrawler_urls
config = connector_config or {}
urls = parse_webcrawler_urls(config.get("INITIAL_URLS"))
if not urls:
logger.info(
f"Webcrawler connector {connector_id} has no URLs configured, "
"skipping first indexing run (will run when URLs are added)"
)
return True # Return success - schedule is created, just no first run
logger.info(
f"Periodic indexing enabled for connector {connector_id} "
f"(frequency: {frequency_minutes} minutes). Triggering first run..."

View file

@ -0,0 +1,29 @@
"""
Utility functions for webcrawler connector.
This module is intentionally kept separate from the connector_indexers package
to avoid circular import issues.
"""
def parse_webcrawler_urls(initial_urls: str | list | None) -> list[str]:
"""
Parse URLs from webcrawler INITIAL_URLS value.
Handles both string (newline-separated) and list formats.
Args:
initial_urls: The INITIAL_URLS value (string, list, or None)
Returns:
List of parsed, stripped, non-empty URLs
"""
if initial_urls is None:
return []
if isinstance(initial_urls, str):
return [url.strip() for url in initial_urls.split("\n") if url.strip()]
elif isinstance(initial_urls, list):
return [url.strip() for url in initial_urls if url.strip()]
else:
return []