mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-07-04 22:02:16 +02:00
fix: skip webcrawler indexing gracefully when no URLs configured
This commit is contained in:
parent
4f7ed8439f
commit
b20fbaca4b
5 changed files with 103 additions and 14 deletions
|
|
@ -156,6 +156,39 @@ async def _check_and_trigger_schedules():
|
|||
)
|
||||
await session.commit()
|
||||
continue
|
||||
|
||||
# Special handling for Webcrawler - skip if no URLs configured
|
||||
elif (
|
||||
connector.connector_type
|
||||
== SearchSourceConnectorType.WEBCRAWLER_CONNECTOR
|
||||
):
|
||||
from app.utils.webcrawler_utils import parse_webcrawler_urls
|
||||
|
||||
connector_config = connector.config or {}
|
||||
urls = parse_webcrawler_urls(connector_config.get("INITIAL_URLS"))
|
||||
|
||||
if urls:
|
||||
task.delay(
|
||||
connector.id,
|
||||
connector.search_space_id,
|
||||
str(connector.user_id),
|
||||
None, # start_date
|
||||
None, # end_date
|
||||
)
|
||||
else:
|
||||
# No URLs configured - skip indexing but still update next_scheduled_at
|
||||
logger.info(
|
||||
f"Webcrawler connector {connector.id} has no URLs configured, "
|
||||
"skipping periodic indexing (will check again at next scheduled time)"
|
||||
)
|
||||
from datetime import timedelta
|
||||
|
||||
connector.next_scheduled_at = now + timedelta(
|
||||
minutes=connector.indexing_frequency_minutes
|
||||
)
|
||||
await session.commit()
|
||||
continue
|
||||
|
||||
else:
|
||||
task.delay(
|
||||
connector.id,
|
||||
|
|
|
|||
|
|
@ -18,6 +18,7 @@ from app.utils.document_converters import (
|
|||
generate_document_summary,
|
||||
generate_unique_identifier_hash,
|
||||
)
|
||||
from app.utils.webcrawler_utils import parse_webcrawler_urls
|
||||
|
||||
from .base import (
|
||||
check_document_by_unique_identifier,
|
||||
|
|
@ -97,13 +98,7 @@ async def index_crawled_urls(
|
|||
api_key = connector.config.get("FIRECRAWL_API_KEY")
|
||||
|
||||
# Get URLs from connector config
|
||||
initial_urls = connector.config.get("INITIAL_URLS", "")
|
||||
if isinstance(initial_urls, str):
|
||||
urls = [url.strip() for url in initial_urls.split("\n") if url.strip()]
|
||||
elif isinstance(initial_urls, list):
|
||||
urls = [url.strip() for url in initial_urls if url.strip()]
|
||||
else:
|
||||
urls = []
|
||||
urls = parse_webcrawler_urls(connector.config.get("INITIAL_URLS"))
|
||||
|
||||
logger.info(
|
||||
f"Starting crawled web page indexing for connector {connector_id} with {len(urls)} URLs"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue