fix: skip webcrawler indexing gracefully when no URLs configured

This commit is contained in:
CREDO23 2026-01-28 17:54:46 +02:00
parent 4f7ed8439f
commit b20fbaca4b
5 changed files with 103 additions and 14 deletions

View file

@ -187,6 +187,7 @@ async def create_search_source_connector(
user_id=str(user.id), user_id=str(user.id),
connector_type=db_connector.connector_type, connector_type=db_connector.connector_type,
frequency_minutes=db_connector.indexing_frequency_minutes, frequency_minutes=db_connector.indexing_frequency_minutes,
connector_config=db_connector.config,
) )
if not success: if not success:
logger.warning( logger.warning(
@ -646,6 +647,7 @@ async def index_connector_content(
# Handle different connector types # Handle different connector types
response_message = "" response_message = ""
indexing_started = True
# Use UTC for consistency with last_indexed_at storage # Use UTC for consistency with last_indexed_at storage
today_str = datetime.now(UTC).strftime("%Y-%m-%d") today_str = datetime.now(UTC).strftime("%Y-%m-%d")
@ -921,14 +923,27 @@ async def index_connector_content(
elif connector.connector_type == SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: elif connector.connector_type == SearchSourceConnectorType.WEBCRAWLER_CONNECTOR:
from app.tasks.celery_tasks.connector_tasks import index_crawled_urls_task from app.tasks.celery_tasks.connector_tasks import index_crawled_urls_task
from app.utils.webcrawler_utils import parse_webcrawler_urls
logger.info( # Check if URLs are configured before triggering indexing
f"Triggering web pages indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}" connector_config = connector.config or {}
) urls = parse_webcrawler_urls(connector_config.get("INITIAL_URLS"))
index_crawled_urls_task.delay(
connector_id, search_space_id, str(user.id), indexing_from, indexing_to if not urls:
) # URLs are optional - skip indexing gracefully
response_message = "Web page indexing started in the background." logger.info(
f"Webcrawler connector {connector_id} has no URLs configured, skipping indexing"
)
response_message = "No URLs configured for this connector. Add URLs in the connector settings to enable indexing."
indexing_started = False
else:
logger.info(
f"Triggering web pages indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}"
)
index_crawled_urls_task.delay(
connector_id, search_space_id, str(user.id), indexing_from, indexing_to
)
response_message = "Web page indexing started in the background."
elif connector.connector_type == SearchSourceConnectorType.OBSIDIAN_CONNECTOR: elif connector.connector_type == SearchSourceConnectorType.OBSIDIAN_CONNECTOR:
from app.config import config as app_config from app.config import config as app_config
@ -1025,6 +1040,7 @@ async def index_connector_content(
return { return {
"message": response_message, "message": response_message,
"indexing_started": indexing_started,
"connector_id": connector_id, "connector_id": connector_id,
"search_space_id": search_space_id, "search_space_id": search_space_id,
"indexing_from": indexing_from, "indexing_from": indexing_from,

View file

@ -156,6 +156,39 @@ async def _check_and_trigger_schedules():
) )
await session.commit() await session.commit()
continue continue
# Special handling for Webcrawler - skip if no URLs configured
elif (
connector.connector_type
== SearchSourceConnectorType.WEBCRAWLER_CONNECTOR
):
from app.utils.webcrawler_utils import parse_webcrawler_urls
connector_config = connector.config or {}
urls = parse_webcrawler_urls(connector_config.get("INITIAL_URLS"))
if urls:
task.delay(
connector.id,
connector.search_space_id,
str(connector.user_id),
None, # start_date
None, # end_date
)
else:
# No URLs configured - skip indexing but still update next_scheduled_at
logger.info(
f"Webcrawler connector {connector.id} has no URLs configured, "
"skipping periodic indexing (will check again at next scheduled time)"
)
from datetime import timedelta
connector.next_scheduled_at = now + timedelta(
minutes=connector.indexing_frequency_minutes
)
await session.commit()
continue
else: else:
task.delay( task.delay(
connector.id, connector.id,

View file

@ -18,6 +18,7 @@ from app.utils.document_converters import (
generate_document_summary, generate_document_summary,
generate_unique_identifier_hash, generate_unique_identifier_hash,
) )
from app.utils.webcrawler_utils import parse_webcrawler_urls
from .base import ( from .base import (
check_document_by_unique_identifier, check_document_by_unique_identifier,
@ -97,13 +98,7 @@ async def index_crawled_urls(
api_key = connector.config.get("FIRECRAWL_API_KEY") api_key = connector.config.get("FIRECRAWL_API_KEY")
# Get URLs from connector config # Get URLs from connector config
initial_urls = connector.config.get("INITIAL_URLS", "") urls = parse_webcrawler_urls(connector.config.get("INITIAL_URLS"))
if isinstance(initial_urls, str):
urls = [url.strip() for url in initial_urls.split("\n") if url.strip()]
elif isinstance(initial_urls, list):
urls = [url.strip() for url in initial_urls if url.strip()]
else:
urls = []
logger.info( logger.info(
f"Starting crawled web page indexing for connector {connector_id} with {len(urls)} URLs" f"Starting crawled web page indexing for connector {connector_id} with {len(urls)} URLs"

View file

@ -43,6 +43,7 @@ def create_periodic_schedule(
user_id: str, user_id: str,
connector_type: SearchSourceConnectorType, connector_type: SearchSourceConnectorType,
frequency_minutes: int, frequency_minutes: int,
connector_config: dict | None = None,
) -> bool: ) -> bool:
""" """
Trigger the first indexing run immediately when periodic indexing is enabled. Trigger the first indexing run immediately when periodic indexing is enabled.
@ -57,11 +58,26 @@ def create_periodic_schedule(
user_id: User ID user_id: User ID
connector_type: Type of connector connector_type: Type of connector
frequency_minutes: Frequency in minutes (used for logging) frequency_minutes: Frequency in minutes (used for logging)
connector_config: Optional connector config dict for validation
Returns: Returns:
True if successful, False otherwise True if successful, False otherwise
""" """
try: try:
# Special handling for connectors that require config validation
if connector_type == SearchSourceConnectorType.WEBCRAWLER_CONNECTOR:
from app.utils.webcrawler_utils import parse_webcrawler_urls
config = connector_config or {}
urls = parse_webcrawler_urls(config.get("INITIAL_URLS"))
if not urls:
logger.info(
f"Webcrawler connector {connector_id} has no URLs configured, "
"skipping first indexing run (will run when URLs are added)"
)
return True # Return success - schedule is created, just no first run
logger.info( logger.info(
f"Periodic indexing enabled for connector {connector_id} " f"Periodic indexing enabled for connector {connector_id} "
f"(frequency: {frequency_minutes} minutes). Triggering first run..." f"(frequency: {frequency_minutes} minutes). Triggering first run..."

View file

@ -0,0 +1,29 @@
"""
Utility functions for webcrawler connector.
This module is intentionally kept separate from the connector_indexers package
to avoid circular import issues.
"""
def parse_webcrawler_urls(initial_urls: str | list | None) -> list[str]:
"""
Parse URLs from webcrawler INITIAL_URLS value.
Handles both string (newline-separated) and list formats.
Args:
initial_urls: The INITIAL_URLS value (string, list, or None)
Returns:
List of parsed, stripped, non-empty URLs
"""
if initial_urls is None:
return []
if isinstance(initial_urls, str):
return [url.strip() for url in initial_urls.split("\n") if url.strip()]
elif isinstance(initial_urls, list):
return [url.strip() for url in initial_urls if url.strip()]
else:
return []