diff --git a/surfsense_backend/app/agents/new_chat/tools/link_preview.py b/surfsense_backend/app/agents/new_chat/tools/link_preview.py
index 17e89345e..13f8a1f1a 100644
--- a/surfsense_backend/app/agents/new_chat/tools/link_preview.py
+++ b/surfsense_backend/app/agents/new_chat/tools/link_preview.py
@@ -280,15 +280,18 @@ def create_link_preview_tool():
             url = f"https://{url}"
 
         try:
+            # Generate a random User-Agent to avoid bot detection
+            ua = UserAgent()
+            user_agent = ua.random
+
             # Use a browser-like User-Agent to fetch Open Graph metadata.
-            # This is the same approach used by Slack, Discord, Twitter, etc. for link previews.
             # We're only fetching publicly available metadata (title, description, thumbnail)
             # that websites intentionally expose via OG tags for link preview purposes.
             async with httpx.AsyncClient(
                 timeout=10.0,
                 follow_redirects=True,
                 headers={
-                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+                    "User-Agent": user_agent,
                     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
                     "Accept-Language": "en-US,en;q=0.9",
                     "Accept-Encoding": "gzip, deflate, br",
diff --git a/surfsense_backend/app/connectors/webcrawler_connector.py b/surfsense_backend/app/connectors/webcrawler_connector.py
index 3fc61f0b5..411f99a51 100644
--- a/surfsense_backend/app/connectors/webcrawler_connector.py
+++ b/surfsense_backend/app/connectors/webcrawler_connector.py
@@ -25,7 +25,9 @@ class WebCrawlerConnector:
         Initialize the WebCrawlerConnector class.
 
         Args:
-            firecrawl_api_key: Firecrawl API key (optional, will use AsyncChromiumLoader if not provided)
+            firecrawl_api_key: Firecrawl API key (optional). If provided, Firecrawl will be tried first
+                and Chromium will be used as a fallback if Firecrawl fails. If not provided,
+                Chromium will be used directly.
         """
         self.firecrawl_api_key = firecrawl_api_key
         self.use_firecrawl = bool(firecrawl_api_key)
@@ -46,6 +48,9 @@ class WebCrawlerConnector:
         """
         Crawl a single URL and extract its content.
 
+        If a Firecrawl API key is provided, tries Firecrawl first and falls back to Chromium
+        if Firecrawl fails. If no Firecrawl API key is provided, uses Chromium directly.
+
         Args:
             url: URL to crawl
             formats: List of formats to extract (e.g., ["markdown", "html"]) - only for Firecrawl
@@ -56,19 +61,32 @@ class WebCrawlerConnector:
                 - content: Extracted content (markdown or HTML)
                 - metadata: Page metadata (title, description, etc.)
                 - source: Original URL
-                - crawler_type: Type of crawler used
+                - crawler_type: Type of crawler used ("firecrawl" or "chromium")
         """
         try:
             # Validate URL
             if not validators.url(url):
                 return None, f"Invalid URL: {url}"
 
+            # Try Firecrawl first if an API key is provided
             if self.use_firecrawl:
-                result = await self._crawl_with_firecrawl(url, formats)
+                try:
+                    logger.info(f"[webcrawler] Using Firecrawl for: {url}")
+                    result = await self._crawl_with_firecrawl(url, formats)
+                    return result, None
+                except Exception as firecrawl_error:
+                    # Firecrawl failed, fall back to Chromium
+                    logger.warning(f"[webcrawler] Firecrawl failed, falling back to Chromium+Trafilatura for: {url}")
+                    try:
+                        result = await self._crawl_with_chromium(url)
+                        return result, None
+                    except Exception as chromium_error:
+                        return None, f"Both Firecrawl and Chromium failed. Firecrawl error: {firecrawl_error!s}, Chromium error: {chromium_error!s}"
             else:
+                # No Firecrawl API key, use Chromium directly
+                logger.info(f"[webcrawler] Using Chromium+Trafilatura for: {url}")
                 result = await self._crawl_with_chromium(url)
-
-            return result, None
+                return result, None
 
         except Exception as e:
             return None, f"Error crawling URL {url}: {e!s}"
@@ -162,10 +180,6 @@ class WebCrawlerConnector:
         trafilatura_metadata = None
 
         try:
-            logger.info(
-                f"Attempting to extract main content from {url} using Trafilatura"
-            )
-
            # Extract main content as markdown
            extracted_content = trafilatura.extract(
                raw_html,
@@ -179,23 +193,10 @@ class WebCrawlerConnector:
             # Extract metadata using Trafilatura
             trafilatura_metadata = trafilatura.extract_metadata(raw_html)
 
-            if extracted_content and len(extracted_content.strip()) > 0:
-                logger.info(
-                    f"Successfully extracted main content from {url} using Trafilatura "
-                    f"({len(extracted_content)} chars vs {len(raw_html)} chars raw HTML)"
-                )
-            else:
-                logger.warning(
-                    f"Trafilatura extraction returned empty content for {url}, "
-                    "falling back to raw HTML"
-                )
+            if not extracted_content or len(extracted_content.strip()) == 0:
                 extracted_content = None
 
-        except Exception as e:
-            logger.warning(
-                f"Trafilatura extraction failed for {url}: {e}. "
-                "Falling back to raw HTML"
-            )
+        except Exception:
             extracted_content = None
 
         # Build metadata, preferring Trafilatura metadata when available
diff --git a/surfsense_backend/app/tasks/chat/stream_new_chat.py b/surfsense_backend/app/tasks/chat/stream_new_chat.py
index aff6fa32b..8b326e1d1 100644
--- a/surfsense_backend/app/tasks/chat/stream_new_chat.py
+++ b/surfsense_backend/app/tasks/chat/stream_new_chat.py
@@ -146,6 +146,16 @@ async def stream_new_chat(
     # Create connector service
     connector_service = ConnectorService(session, search_space_id=search_space_id)
 
+    # Get Firecrawl API key from webcrawler connector if configured
+    from app.db import SearchSourceConnectorType
+
+    firecrawl_api_key = None
+    webcrawler_connector = await connector_service.get_connector_by_type(
+        SearchSourceConnectorType.WEBCRAWLER_CONNECTOR, search_space_id
+    )
+    if webcrawler_connector and webcrawler_connector.config:
+        firecrawl_api_key = webcrawler_connector.config.get("FIRECRAWL_API_KEY")
+
     # Get the PostgreSQL checkpointer for persistent conversation memory
     checkpointer = await get_checkpointer()
 
@@ -157,6 +167,7 @@ async def stream_new_chat(
         connector_service=connector_service,
         checkpointer=checkpointer,
         agent_config=agent_config,  # Pass prompt configuration
+        firecrawl_api_key=firecrawl_api_key,  # Pass Firecrawl API key if configured
     )
 
     # Build input with message history from frontend
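
For reference, a minimal sketch of how the new fallback path might be exercised. The constructor argument, the formats parameter, the (result, error) return shape, and the result keys come from the diff above; the method name crawl_url is assumed (it is not visible in the hunks), and the URL is a placeholder.

import asyncio

from app.connectors.webcrawler_connector import WebCrawlerConnector


async def main() -> None:
    # With a Firecrawl API key, Firecrawl is tried first and Chromium+Trafilatura
    # is the fallback; with no key, Chromium+Trafilatura is used directly.
    connector = WebCrawlerConnector(firecrawl_api_key=None)

    # crawl_url is an assumed method name; "https://example.com" is a placeholder URL.
    result, error = await connector.crawl_url("https://example.com", formats=["markdown"])
    if error:
        print(f"Crawl failed: {error}")
    else:
        print(result["crawler_type"], len(result["content"]))


asyncio.run(main())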