refactor: enhance web crawling functionality with Firecrawl integration

- Updated WebCrawlerConnector to try the Firecrawl API first when an API key is provided, falling back to Chromium if Firecrawl fails (a sketch of the fallback flow follows below).
- Improved error handling to log Firecrawl failures and to report both errors when the Chromium fallback also fails.
- Updated the link preview tool to send a random User-Agent, reducing the chance of being blocked as a bot by web servers.
- Read the Firecrawl API key from the webcrawler connector config and passed it to the stream_new_chat function.
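
A minimal, self-contained sketch of the fallback flow described above. The names crawl_with_firecrawl_stub, crawl_with_chromium_stub, and crawl_with_fallback are illustrative stand-ins, not the project's API; the real logic lives in WebCrawlerConnector.crawl_url, shown in the diff below.

import asyncio


# Illustrative stand-ins for the two crawler backends (hypothetical names);
# the real connector calls its own _crawl_with_firecrawl / _crawl_with_chromium.
async def crawl_with_firecrawl_stub(url: str) -> dict:
    raise RuntimeError("simulated Firecrawl failure")


async def crawl_with_chromium_stub(url: str) -> dict:
    return {"content": f"<html><title>{url}</title></html>", "crawler_type": "chromium"}


async def crawl_with_fallback(url: str, use_firecrawl: bool):
    """Try Firecrawl first when configured; fall back to Chromium if it fails."""
    if use_firecrawl:
        try:
            return await crawl_with_firecrawl_stub(url), None
        except Exception as firecrawl_error:
            try:
                return await crawl_with_chromium_stub(url), None
            except Exception as chromium_error:
                return None, (
                    f"Both Firecrawl and Chromium failed. "
                    f"Firecrawl error: {firecrawl_error!s}, Chromium error: {chromium_error!s}"
                )
    # No Firecrawl API key configured: use Chromium directly
    return await crawl_with_chromium_stub(url), None


if __name__ == "__main__":
    result, error = asyncio.run(crawl_with_fallback("https://example.com", use_firecrawl=True))
    print(result, error)

With use_firecrawl=True the stub Firecrawl call fails and the Chromium stub result is returned; with use_firecrawl=False Chromium is used directly, matching the behavior documented in the new crawl_url docstring.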
Anish Sarkar 2025-12-26 02:37:20 +05:30
parent ad5a49c2c6
commit d9df63f57e
3 changed files with 41 additions and 26 deletions

@@ -280,15 +280,18 @@ def create_link_preview_tool():
             url = f"https://{url}"
         try:
+            # Generate a random User-Agent to avoid bot detection
+            ua = UserAgent()
+            user_agent = ua.random
             # Use a browser-like User-Agent to fetch Open Graph metadata.
             # This is the same approach used by Slack, Discord, Twitter, etc. for link previews.
             # We're only fetching publicly available metadata (title, description, thumbnail)
             # that websites intentionally expose via OG tags for link preview purposes.
             async with httpx.AsyncClient(
                 timeout=10.0,
                 follow_redirects=True,
                 headers={
-                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+                    "User-Agent": user_agent,
                     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
                     "Accept-Language": "en-US,en;q=0.9",
                     "Accept-Encoding": "gzip, deflate, br",

@@ -25,7 +25,9 @@ class WebCrawlerConnector:
         Initialize the WebCrawlerConnector class.
         Args:
-            firecrawl_api_key: Firecrawl API key (optional, will use AsyncChromiumLoader if not provided)
+            firecrawl_api_key: Firecrawl API key (optional). If provided, Firecrawl will be tried first
+                and Chromium will be used as fallback if Firecrawl fails. If not provided,
+                Chromium will be used directly.
         """
         self.firecrawl_api_key = firecrawl_api_key
         self.use_firecrawl = bool(firecrawl_api_key)
@@ -46,6 +48,9 @@ class WebCrawlerConnector:
         """
         Crawl a single URL and extract its content.
+        If Firecrawl API key is provided, tries Firecrawl first and falls back to Chromium
+        if Firecrawl fails. If no Firecrawl API key is provided, uses Chromium directly.
         Args:
             url: URL to crawl
             formats: List of formats to extract (e.g., ["markdown", "html"]) - only for Firecrawl
@@ -56,19 +61,32 @@
                 - content: Extracted content (markdown or HTML)
                 - metadata: Page metadata (title, description, etc.)
                 - source: Original URL
-                - crawler_type: Type of crawler used
+                - crawler_type: Type of crawler used ("firecrawl" or "chromium")
         """
         try:
             # Validate URL
             if not validators.url(url):
                 return None, f"Invalid URL: {url}"
+            # Try Firecrawl first if API key is provided
             if self.use_firecrawl:
-                result = await self._crawl_with_firecrawl(url, formats)
+                try:
+                    logger.info(f"[webcrawler] Using Firecrawl for: {url}")
+                    result = await self._crawl_with_firecrawl(url, formats)
+                    return result, None
+                except Exception as firecrawl_error:
+                    # Firecrawl failed, fallback to Chromium
+                    logger.warning(f"[webcrawler] Firecrawl failed, falling back to Chromium+Trafilatura for: {url}")
+                    try:
+                        result = await self._crawl_with_chromium(url)
+                        return result, None
+                    except Exception as chromium_error:
+                        return None, f"Both Firecrawl and Chromium failed. Firecrawl error: {firecrawl_error!s}, Chromium error: {chromium_error!s}"
             else:
                 # No Firecrawl API key, use Chromium directly
+                logger.info(f"[webcrawler] Using Chromium+Trafilatura for: {url}")
                 result = await self._crawl_with_chromium(url)
+                return result, None
-            return result, None
         except Exception as e:
             return None, f"Error crawling URL {url}: {e!s}"
@@ -162,10 +180,6 @@
         trafilatura_metadata = None
         try:
-            logger.info(
-                f"Attempting to extract main content from {url} using Trafilatura"
-            )
             # Extract main content as markdown
             extracted_content = trafilatura.extract(
                 raw_html,
@@ -179,23 +193,10 @@
             # Extract metadata using Trafilatura
             trafilatura_metadata = trafilatura.extract_metadata(raw_html)
-            if extracted_content and len(extracted_content.strip()) > 0:
-                logger.info(
-                    f"Successfully extracted main content from {url} using Trafilatura "
-                    f"({len(extracted_content)} chars vs {len(raw_html)} chars raw HTML)"
-                )
-            else:
-                logger.warning(
-                    f"Trafilatura extraction returned empty content for {url}, "
-                    "falling back to raw HTML"
-                )
+            if not extracted_content or len(extracted_content.strip()) == 0:
                 extracted_content = None
-        except Exception as e:
-            logger.warning(
-                f"Trafilatura extraction failed for {url}: {e}. "
-                "Falling back to raw HTML"
-            )
+        except Exception:
             extracted_content = None
         # Build metadata, preferring Trafilatura metadata when available

@@ -146,6 +146,16 @@ async def stream_new_chat(
     # Create connector service
     connector_service = ConnectorService(session, search_space_id=search_space_id)
+    # Get Firecrawl API key from webcrawler connector if configured
+    from app.db import SearchSourceConnectorType
+    firecrawl_api_key = None
+    webcrawler_connector = await connector_service.get_connector_by_type(
+        SearchSourceConnectorType.WEBCRAWLER_CONNECTOR, search_space_id
+    )
+    if webcrawler_connector and webcrawler_connector.config:
+        firecrawl_api_key = webcrawler_connector.config.get("FIRECRAWL_API_KEY")
     # Get the PostgreSQL checkpointer for persistent conversation memory
     checkpointer = await get_checkpointer()
@@ -157,6 +167,7 @@
         connector_service=connector_service,
         checkpointer=checkpointer,
         agent_config=agent_config,  # Pass prompt configuration
+        firecrawl_api_key=firecrawl_api_key,  # Pass Firecrawl API key if configured
     )
     # Build input with message history from frontend