mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-26 21:39:43 +02:00
refactor: enhance web crawling functionality with Firecrawl integration
- Updated WebCrawlerConnector to prioritize the Firecrawl API for crawling if an API key is provided, falling back to Chromium if Firecrawl fails.
- Improved error handling to log failures from both Firecrawl and Chromium.
- Enhanced the link preview tool to use a random User-Agent for better compatibility with web servers.
- Passed the Firecrawl API key to the stream_new_chat function for improved configuration management.
This commit is contained in:
parent
ad5a49c2c6
commit
d9df63f57e
3 changed files with 41 additions and 26 deletions
|
|
@ -280,15 +280,18 @@ def create_link_preview_tool():
|
||||||
url = f"https://{url}"
|
url = f"https://{url}"
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
# Generate a random User-Agent to avoid bot detection
|
||||||
|
ua = UserAgent()
|
||||||
|
user_agent = ua.random
|
||||||
|
|
||||||
# Use a browser-like User-Agent to fetch Open Graph metadata.
|
# Use a browser-like User-Agent to fetch Open Graph metadata.
|
||||||
# This is the same approach used by Slack, Discord, Twitter, etc. for link previews.
|
|
||||||
# We're only fetching publicly available metadata (title, description, thumbnail)
|
# We're only fetching publicly available metadata (title, description, thumbnail)
|
||||||
# that websites intentionally expose via OG tags for link preview purposes.
|
# that websites intentionally expose via OG tags for link preview purposes.
|
||||||
async with httpx.AsyncClient(
|
async with httpx.AsyncClient(
|
||||||
timeout=10.0,
|
timeout=10.0,
|
||||||
follow_redirects=True,
|
follow_redirects=True,
|
||||||
headers={
|
headers={
|
||||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
"User-Agent": user_agent,
|
||||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
|
||||||
"Accept-Language": "en-US,en;q=0.9",
|
"Accept-Language": "en-US,en;q=0.9",
|
||||||
"Accept-Encoding": "gzip, deflate, br",
|
"Accept-Encoding": "gzip, deflate, br",
|
||||||
|
|
|
||||||
|
|
@ -25,7 +25,9 @@ class WebCrawlerConnector:
|
||||||
Initialize the WebCrawlerConnector class.
|
Initialize the WebCrawlerConnector class.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
firecrawl_api_key: Firecrawl API key (optional, will use AsyncChromiumLoader if not provided)
|
firecrawl_api_key: Firecrawl API key (optional). If provided, Firecrawl will be tried first
|
||||||
|
and Chromium will be used as fallback if Firecrawl fails. If not provided,
|
||||||
|
Chromium will be used directly.
|
||||||
"""
|
"""
|
||||||
self.firecrawl_api_key = firecrawl_api_key
|
self.firecrawl_api_key = firecrawl_api_key
|
||||||
self.use_firecrawl = bool(firecrawl_api_key)
|
self.use_firecrawl = bool(firecrawl_api_key)
|
||||||
|
|
@ -46,6 +48,9 @@ class WebCrawlerConnector:
|
||||||
"""
|
"""
|
||||||
Crawl a single URL and extract its content.
|
Crawl a single URL and extract its content.
|
||||||
|
|
||||||
|
If Firecrawl API key is provided, tries Firecrawl first and falls back to Chromium
|
||||||
|
if Firecrawl fails. If no Firecrawl API key is provided, uses Chromium directly.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
url: URL to crawl
|
url: URL to crawl
|
||||||
formats: List of formats to extract (e.g., ["markdown", "html"]) - only for Firecrawl
|
formats: List of formats to extract (e.g., ["markdown", "html"]) - only for Firecrawl
|
||||||
|
|
@ -56,19 +61,32 @@ class WebCrawlerConnector:
|
||||||
- content: Extracted content (markdown or HTML)
|
- content: Extracted content (markdown or HTML)
|
||||||
- metadata: Page metadata (title, description, etc.)
|
- metadata: Page metadata (title, description, etc.)
|
||||||
- source: Original URL
|
- source: Original URL
|
||||||
- crawler_type: Type of crawler used
|
- crawler_type: Type of crawler used ("firecrawl" or "chromium")
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
# Validate URL
|
# Validate URL
|
||||||
if not validators.url(url):
|
if not validators.url(url):
|
||||||
return None, f"Invalid URL: {url}"
|
return None, f"Invalid URL: {url}"
|
||||||
|
|
||||||
|
# Try Firecrawl first if API key is provided
|
||||||
if self.use_firecrawl:
|
if self.use_firecrawl:
|
||||||
result = await self._crawl_with_firecrawl(url, formats)
|
try:
|
||||||
|
logger.info(f"[webcrawler] Using Firecrawl for: {url}")
|
||||||
|
result = await self._crawl_with_firecrawl(url, formats)
|
||||||
|
return result, None
|
||||||
|
except Exception as firecrawl_error:
|
||||||
|
# Firecrawl failed, fallback to Chromium
|
||||||
|
logger.warning(f"[webcrawler] Firecrawl failed, falling back to Chromium+Trafilatura for: {url}")
|
||||||
|
try:
|
||||||
|
result = await self._crawl_with_chromium(url)
|
||||||
|
return result, None
|
||||||
|
except Exception as chromium_error:
|
||||||
|
return None, f"Both Firecrawl and Chromium failed. Firecrawl error: {firecrawl_error!s}, Chromium error: {chromium_error!s}"
|
||||||
else:
|
else:
|
||||||
|
# No Firecrawl API key, use Chromium directly
|
||||||
|
logger.info(f"[webcrawler] Using Chromium+Trafilatura for: {url}")
|
||||||
result = await self._crawl_with_chromium(url)
|
result = await self._crawl_with_chromium(url)
|
||||||
|
return result, None
|
||||||
return result, None
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return None, f"Error crawling URL {url}: {e!s}"
|
return None, f"Error crawling URL {url}: {e!s}"
|
||||||
|
|
@ -162,10 +180,6 @@ class WebCrawlerConnector:
|
||||||
trafilatura_metadata = None
|
trafilatura_metadata = None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
logger.info(
|
|
||||||
f"Attempting to extract main content from {url} using Trafilatura"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Extract main content as markdown
|
# Extract main content as markdown
|
||||||
extracted_content = trafilatura.extract(
|
extracted_content = trafilatura.extract(
|
||||||
raw_html,
|
raw_html,
|
||||||
|
|
@ -179,23 +193,10 @@ class WebCrawlerConnector:
|
||||||
# Extract metadata using Trafilatura
|
# Extract metadata using Trafilatura
|
||||||
trafilatura_metadata = trafilatura.extract_metadata(raw_html)
|
trafilatura_metadata = trafilatura.extract_metadata(raw_html)
|
||||||
|
|
||||||
if extracted_content and len(extracted_content.strip()) > 0:
|
if not extracted_content or len(extracted_content.strip()) == 0:
|
||||||
logger.info(
|
|
||||||
f"Successfully extracted main content from {url} using Trafilatura "
|
|
||||||
f"({len(extracted_content)} chars vs {len(raw_html)} chars raw HTML)"
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
logger.warning(
|
|
||||||
f"Trafilatura extraction returned empty content for {url}, "
|
|
||||||
"falling back to raw HTML"
|
|
||||||
)
|
|
||||||
extracted_content = None
|
extracted_content = None
|
||||||
|
|
||||||
except Exception as e:
|
except Exception:
|
||||||
logger.warning(
|
|
||||||
f"Trafilatura extraction failed for {url}: {e}. "
|
|
||||||
"Falling back to raw HTML"
|
|
||||||
)
|
|
||||||
extracted_content = None
|
extracted_content = None
|
||||||
|
|
||||||
# Build metadata, preferring Trafilatura metadata when available
|
# Build metadata, preferring Trafilatura metadata when available
|
||||||
|
|
|
||||||
|
|
@ -146,6 +146,16 @@ async def stream_new_chat(
|
||||||
# Create connector service
|
# Create connector service
|
||||||
connector_service = ConnectorService(session, search_space_id=search_space_id)
|
connector_service = ConnectorService(session, search_space_id=search_space_id)
|
||||||
|
|
||||||
|
# Get Firecrawl API key from webcrawler connector if configured
|
||||||
|
from app.db import SearchSourceConnectorType
|
||||||
|
|
||||||
|
firecrawl_api_key = None
|
||||||
|
webcrawler_connector = await connector_service.get_connector_by_type(
|
||||||
|
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR, search_space_id
|
||||||
|
)
|
||||||
|
if webcrawler_connector and webcrawler_connector.config:
|
||||||
|
firecrawl_api_key = webcrawler_connector.config.get("FIRECRAWL_API_KEY")
|
||||||
|
|
||||||
# Get the PostgreSQL checkpointer for persistent conversation memory
|
# Get the PostgreSQL checkpointer for persistent conversation memory
|
||||||
checkpointer = await get_checkpointer()
|
checkpointer = await get_checkpointer()
|
||||||
|
|
||||||
|
|
@ -157,6 +167,7 @@ async def stream_new_chat(
|
||||||
connector_service=connector_service,
|
connector_service=connector_service,
|
||||||
checkpointer=checkpointer,
|
checkpointer=checkpointer,
|
||||||
agent_config=agent_config, # Pass prompt configuration
|
agent_config=agent_config, # Pass prompt configuration
|
||||||
|
firecrawl_api_key=firecrawl_api_key, # Pass Firecrawl API key if configured
|
||||||
)
|
)
|
||||||
|
|
||||||
# Build input with message history from frontend
|
# Build input with message history from frontend
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue