mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-25 00:36:31 +02:00
refactor: enhance web crawling functionality with Firecrawl integration

- Updated WebCrawlerConnector to prioritize Firecrawl API for crawling if an API key is provided, falling back to Chromium if Firecrawl fails.
- Improved error handling to log failures from both Firecrawl and Chromium.
- Enhanced link preview tool to use a random User-Agent for better compatibility with web servers.
- Passed Firecrawl API key to the stream_new_chat function for improved configuration management.
This commit is contained in:
parent
ad5a49c2c6
commit
d9df63f57e
3 changed files with 41 additions and 26 deletions
|
|
@@ -280,15 +280,18 @@ def create_link_preview_tool():
|
|||
url = f"https://{url}"
|
||||
|
||||
try:
|
||||
# Generate a random User-Agent to avoid bot detection
|
||||
ua = UserAgent()
|
||||
user_agent = ua.random
|
||||
|
||||
# Use a browser-like User-Agent to fetch Open Graph metadata.
|
||||
# This is the same approach used by Slack, Discord, Twitter, etc. for link previews.
|
||||
# We're only fetching publicly available metadata (title, description, thumbnail)
|
||||
# that websites intentionally expose via OG tags for link preview purposes.
|
||||
async with httpx.AsyncClient(
|
||||
timeout=10.0,
|
||||
follow_redirects=True,
|
||||
headers={
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||
"User-Agent": user_agent,
|
||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
|
||||
"Accept-Language": "en-US,en;q=0.9",
|
||||
"Accept-Encoding": "gzip, deflate, br",
|
||||
|
|
|
|||
|
|
@@ -25,7 +25,9 @@ class WebCrawlerConnector:
|
|||
Initialize the WebCrawlerConnector class.
|
||||
|
||||
Args:
|
||||
firecrawl_api_key: Firecrawl API key (optional, will use AsyncChromiumLoader if not provided)
|
||||
firecrawl_api_key: Firecrawl API key (optional). If provided, Firecrawl will be tried first
|
||||
and Chromium will be used as fallback if Firecrawl fails. If not provided,
|
||||
Chromium will be used directly.
|
||||
"""
|
||||
self.firecrawl_api_key = firecrawl_api_key
|
||||
self.use_firecrawl = bool(firecrawl_api_key)
|
||||
|
|
@@ -46,6 +48,9 @@ class WebCrawlerConnector:
|
|||
"""
|
||||
Crawl a single URL and extract its content.
|
||||
|
||||
If Firecrawl API key is provided, tries Firecrawl first and falls back to Chromium
|
||||
if Firecrawl fails. If no Firecrawl API key is provided, uses Chromium directly.
|
||||
|
||||
Args:
|
||||
url: URL to crawl
|
||||
formats: List of formats to extract (e.g., ["markdown", "html"]) - only for Firecrawl
|
||||
|
|
@@ -56,19 +61,32 @@ class WebCrawlerConnector:
|
|||
- content: Extracted content (markdown or HTML)
|
||||
- metadata: Page metadata (title, description, etc.)
|
||||
- source: Original URL
|
||||
- crawler_type: Type of crawler used
|
||||
- crawler_type: Type of crawler used ("firecrawl" or "chromium")
|
||||
"""
|
||||
try:
|
||||
# Validate URL
|
||||
if not validators.url(url):
|
||||
return None, f"Invalid URL: {url}"
|
||||
|
||||
# Try Firecrawl first if API key is provided
|
||||
if self.use_firecrawl:
|
||||
result = await self._crawl_with_firecrawl(url, formats)
|
||||
try:
|
||||
logger.info(f"[webcrawler] Using Firecrawl for: {url}")
|
||||
result = await self._crawl_with_firecrawl(url, formats)
|
||||
return result, None
|
||||
except Exception as firecrawl_error:
|
||||
# Firecrawl failed, fallback to Chromium
|
||||
logger.warning(f"[webcrawler] Firecrawl failed, falling back to Chromium+Trafilatura for: {url}")
|
||||
try:
|
||||
result = await self._crawl_with_chromium(url)
|
||||
return result, None
|
||||
except Exception as chromium_error:
|
||||
return None, f"Both Firecrawl and Chromium failed. Firecrawl error: {firecrawl_error!s}, Chromium error: {chromium_error!s}"
|
||||
else:
|
||||
# No Firecrawl API key, use Chromium directly
|
||||
logger.info(f"[webcrawler] Using Chromium+Trafilatura for: {url}")
|
||||
result = await self._crawl_with_chromium(url)
|
||||
|
||||
return result, None
|
||||
return result, None
|
||||
|
||||
except Exception as e:
|
||||
return None, f"Error crawling URL {url}: {e!s}"
|
||||
|
|
@@ -162,10 +180,6 @@ class WebCrawlerConnector:
|
|||
trafilatura_metadata = None
|
||||
|
||||
try:
|
||||
logger.info(
|
||||
f"Attempting to extract main content from {url} using Trafilatura"
|
||||
)
|
||||
|
||||
# Extract main content as markdown
|
||||
extracted_content = trafilatura.extract(
|
||||
raw_html,
|
||||
|
|
@@ -179,23 +193,10 @@ class WebCrawlerConnector:
|
|||
# Extract metadata using Trafilatura
|
||||
trafilatura_metadata = trafilatura.extract_metadata(raw_html)
|
||||
|
||||
if extracted_content and len(extracted_content.strip()) > 0:
|
||||
logger.info(
|
||||
f"Successfully extracted main content from {url} using Trafilatura "
|
||||
f"({len(extracted_content)} chars vs {len(raw_html)} chars raw HTML)"
|
||||
)
|
||||
else:
|
||||
logger.warning(
|
||||
f"Trafilatura extraction returned empty content for {url}, "
|
||||
"falling back to raw HTML"
|
||||
)
|
||||
if not extracted_content or len(extracted_content.strip()) == 0:
|
||||
extracted_content = None
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"Trafilatura extraction failed for {url}: {e}. "
|
||||
"Falling back to raw HTML"
|
||||
)
|
||||
except Exception:
|
||||
extracted_content = None
|
||||
|
||||
# Build metadata, preferring Trafilatura metadata when available
|
||||
|
|
|
|||
|
|
@@ -146,6 +146,16 @@ async def stream_new_chat(
|
|||
# Create connector service
|
||||
connector_service = ConnectorService(session, search_space_id=search_space_id)
|
||||
|
||||
# Get Firecrawl API key from webcrawler connector if configured
|
||||
from app.db import SearchSourceConnectorType
|
||||
|
||||
firecrawl_api_key = None
|
||||
webcrawler_connector = await connector_service.get_connector_by_type(
|
||||
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR, search_space_id
|
||||
)
|
||||
if webcrawler_connector and webcrawler_connector.config:
|
||||
firecrawl_api_key = webcrawler_connector.config.get("FIRECRAWL_API_KEY")
|
||||
|
||||
# Get the PostgreSQL checkpointer for persistent conversation memory
|
||||
checkpointer = await get_checkpointer()
|
||||
|
||||
|
|
@@ -157,6 +167,7 @@ async def stream_new_chat(
|
|||
connector_service=connector_service,
|
||||
checkpointer=checkpointer,
|
||||
agent_config=agent_config, # Pass prompt configuration
|
||||
firecrawl_api_key=firecrawl_api_key, # Pass Firecrawl API key if configured
|
||||
)
|
||||
|
||||
# Build input with message history from frontend
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue