mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-26 21:39:43 +02:00
fix: skip webcrawler indexing gracefully when no URLs configured
This commit is contained in:
parent
4f7ed8439f
commit
b20fbaca4b
5 changed files with 103 additions and 14 deletions
|
|
@ -187,6 +187,7 @@ async def create_search_source_connector(
|
||||||
user_id=str(user.id),
|
user_id=str(user.id),
|
||||||
connector_type=db_connector.connector_type,
|
connector_type=db_connector.connector_type,
|
||||||
frequency_minutes=db_connector.indexing_frequency_minutes,
|
frequency_minutes=db_connector.indexing_frequency_minutes,
|
||||||
|
connector_config=db_connector.config,
|
||||||
)
|
)
|
||||||
if not success:
|
if not success:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
|
|
@ -646,6 +647,7 @@ async def index_connector_content(
|
||||||
|
|
||||||
# Handle different connector types
|
# Handle different connector types
|
||||||
response_message = ""
|
response_message = ""
|
||||||
|
indexing_started = True
|
||||||
# Use UTC for consistency with last_indexed_at storage
|
# Use UTC for consistency with last_indexed_at storage
|
||||||
today_str = datetime.now(UTC).strftime("%Y-%m-%d")
|
today_str = datetime.now(UTC).strftime("%Y-%m-%d")
|
||||||
|
|
||||||
|
|
@ -921,14 +923,27 @@ async def index_connector_content(
|
||||||
|
|
||||||
elif connector.connector_type == SearchSourceConnectorType.WEBCRAWLER_CONNECTOR:
|
elif connector.connector_type == SearchSourceConnectorType.WEBCRAWLER_CONNECTOR:
|
||||||
from app.tasks.celery_tasks.connector_tasks import index_crawled_urls_task
|
from app.tasks.celery_tasks.connector_tasks import index_crawled_urls_task
|
||||||
|
from app.utils.webcrawler_utils import parse_webcrawler_urls
|
||||||
|
|
||||||
logger.info(
|
# Check if URLs are configured before triggering indexing
|
||||||
f"Triggering web pages indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}"
|
connector_config = connector.config or {}
|
||||||
)
|
urls = parse_webcrawler_urls(connector_config.get("INITIAL_URLS"))
|
||||||
index_crawled_urls_task.delay(
|
|
||||||
connector_id, search_space_id, str(user.id), indexing_from, indexing_to
|
if not urls:
|
||||||
)
|
# URLs are optional - skip indexing gracefully
|
||||||
response_message = "Web page indexing started in the background."
|
logger.info(
|
||||||
|
f"Webcrawler connector {connector_id} has no URLs configured, skipping indexing"
|
||||||
|
)
|
||||||
|
response_message = "No URLs configured for this connector. Add URLs in the connector settings to enable indexing."
|
||||||
|
indexing_started = False
|
||||||
|
else:
|
||||||
|
logger.info(
|
||||||
|
f"Triggering web pages indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}"
|
||||||
|
)
|
||||||
|
index_crawled_urls_task.delay(
|
||||||
|
connector_id, search_space_id, str(user.id), indexing_from, indexing_to
|
||||||
|
)
|
||||||
|
response_message = "Web page indexing started in the background."
|
||||||
|
|
||||||
elif connector.connector_type == SearchSourceConnectorType.OBSIDIAN_CONNECTOR:
|
elif connector.connector_type == SearchSourceConnectorType.OBSIDIAN_CONNECTOR:
|
||||||
from app.config import config as app_config
|
from app.config import config as app_config
|
||||||
|
|
@ -1025,6 +1040,7 @@ async def index_connector_content(
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"message": response_message,
|
"message": response_message,
|
||||||
|
"indexing_started": indexing_started,
|
||||||
"connector_id": connector_id,
|
"connector_id": connector_id,
|
||||||
"search_space_id": search_space_id,
|
"search_space_id": search_space_id,
|
||||||
"indexing_from": indexing_from,
|
"indexing_from": indexing_from,
|
||||||
|
|
|
||||||
|
|
@ -156,6 +156,39 @@ async def _check_and_trigger_schedules():
|
||||||
)
|
)
|
||||||
await session.commit()
|
await session.commit()
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Special handling for Webcrawler - skip if no URLs configured
|
||||||
|
elif (
|
||||||
|
connector.connector_type
|
||||||
|
== SearchSourceConnectorType.WEBCRAWLER_CONNECTOR
|
||||||
|
):
|
||||||
|
from app.utils.webcrawler_utils import parse_webcrawler_urls
|
||||||
|
|
||||||
|
connector_config = connector.config or {}
|
||||||
|
urls = parse_webcrawler_urls(connector_config.get("INITIAL_URLS"))
|
||||||
|
|
||||||
|
if urls:
|
||||||
|
task.delay(
|
||||||
|
connector.id,
|
||||||
|
connector.search_space_id,
|
||||||
|
str(connector.user_id),
|
||||||
|
None, # start_date
|
||||||
|
None, # end_date
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# No URLs configured - skip indexing but still update next_scheduled_at
|
||||||
|
logger.info(
|
||||||
|
f"Webcrawler connector {connector.id} has no URLs configured, "
|
||||||
|
"skipping periodic indexing (will check again at next scheduled time)"
|
||||||
|
)
|
||||||
|
from datetime import timedelta
|
||||||
|
|
||||||
|
connector.next_scheduled_at = now + timedelta(
|
||||||
|
minutes=connector.indexing_frequency_minutes
|
||||||
|
)
|
||||||
|
await session.commit()
|
||||||
|
continue
|
||||||
|
|
||||||
else:
|
else:
|
||||||
task.delay(
|
task.delay(
|
||||||
connector.id,
|
connector.id,
|
||||||
|
|
|
||||||
|
|
@ -18,6 +18,7 @@ from app.utils.document_converters import (
|
||||||
generate_document_summary,
|
generate_document_summary,
|
||||||
generate_unique_identifier_hash,
|
generate_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
|
from app.utils.webcrawler_utils import parse_webcrawler_urls
|
||||||
|
|
||||||
from .base import (
|
from .base import (
|
||||||
check_document_by_unique_identifier,
|
check_document_by_unique_identifier,
|
||||||
|
|
@ -97,13 +98,7 @@ async def index_crawled_urls(
|
||||||
api_key = connector.config.get("FIRECRAWL_API_KEY")
|
api_key = connector.config.get("FIRECRAWL_API_KEY")
|
||||||
|
|
||||||
# Get URLs from connector config
|
# Get URLs from connector config
|
||||||
initial_urls = connector.config.get("INITIAL_URLS", "")
|
urls = parse_webcrawler_urls(connector.config.get("INITIAL_URLS"))
|
||||||
if isinstance(initial_urls, str):
|
|
||||||
urls = [url.strip() for url in initial_urls.split("\n") if url.strip()]
|
|
||||||
elif isinstance(initial_urls, list):
|
|
||||||
urls = [url.strip() for url in initial_urls if url.strip()]
|
|
||||||
else:
|
|
||||||
urls = []
|
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Starting crawled web page indexing for connector {connector_id} with {len(urls)} URLs"
|
f"Starting crawled web page indexing for connector {connector_id} with {len(urls)} URLs"
|
||||||
|
|
|
||||||
|
|
@ -43,6 +43,7 @@ def create_periodic_schedule(
|
||||||
user_id: str,
|
user_id: str,
|
||||||
connector_type: SearchSourceConnectorType,
|
connector_type: SearchSourceConnectorType,
|
||||||
frequency_minutes: int,
|
frequency_minutes: int,
|
||||||
|
connector_config: dict | None = None,
|
||||||
) -> bool:
|
) -> bool:
|
||||||
"""
|
"""
|
||||||
Trigger the first indexing run immediately when periodic indexing is enabled.
|
Trigger the first indexing run immediately when periodic indexing is enabled.
|
||||||
|
|
@ -57,11 +58,26 @@ def create_periodic_schedule(
|
||||||
user_id: User ID
|
user_id: User ID
|
||||||
connector_type: Type of connector
|
connector_type: Type of connector
|
||||||
frequency_minutes: Frequency in minutes (used for logging)
|
frequency_minutes: Frequency in minutes (used for logging)
|
||||||
|
connector_config: Optional connector config dict for validation
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
True if successful, False otherwise
|
True if successful, False otherwise
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
|
# Special handling for connectors that require config validation
|
||||||
|
if connector_type == SearchSourceConnectorType.WEBCRAWLER_CONNECTOR:
|
||||||
|
from app.utils.webcrawler_utils import parse_webcrawler_urls
|
||||||
|
|
||||||
|
config = connector_config or {}
|
||||||
|
urls = parse_webcrawler_urls(config.get("INITIAL_URLS"))
|
||||||
|
|
||||||
|
if not urls:
|
||||||
|
logger.info(
|
||||||
|
f"Webcrawler connector {connector_id} has no URLs configured, "
|
||||||
|
"skipping first indexing run (will run when URLs are added)"
|
||||||
|
)
|
||||||
|
return True # Return success - schedule is created, just no first run
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Periodic indexing enabled for connector {connector_id} "
|
f"Periodic indexing enabled for connector {connector_id} "
|
||||||
f"(frequency: {frequency_minutes} minutes). Triggering first run..."
|
f"(frequency: {frequency_minutes} minutes). Triggering first run..."
|
||||||
|
|
|
||||||
29
surfsense_backend/app/utils/webcrawler_utils.py
Normal file
29
surfsense_backend/app/utils/webcrawler_utils.py
Normal file
|
|
@ -0,0 +1,29 @@
|
||||||
|
"""
|
||||||
|
Utility functions for webcrawler connector.
|
||||||
|
|
||||||
|
This module is intentionally kept separate from the connector_indexers package
|
||||||
|
to avoid circular import issues.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def parse_webcrawler_urls(initial_urls: str | list | None) -> list[str]:
|
||||||
|
"""
|
||||||
|
Parse URLs from webcrawler INITIAL_URLS value.
|
||||||
|
|
||||||
|
Handles both string (newline-separated) and list formats.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
initial_urls: The INITIAL_URLS value (string, list, or None)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of parsed, stripped, non-empty URLs
|
||||||
|
"""
|
||||||
|
if initial_urls is None:
|
||||||
|
return []
|
||||||
|
|
||||||
|
if isinstance(initial_urls, str):
|
||||||
|
return [url.strip() for url in initial_urls.split("\n") if url.strip()]
|
||||||
|
elif isinstance(initial_urls, list):
|
||||||
|
return [url.strip() for url in initial_urls if url.strip()]
|
||||||
|
else:
|
||||||
|
return []
|
||||||
Loading…
Add table
Add a link
Reference in a new issue