From 9d0721de43e18d2a7b6ba610397de194dbd2600a Mon Sep 17 00:00:00 2001
From: "DESKTOP-RTLN3BA\\$punk"
Date: Sat, 27 Dec 2025 13:58:00 -0800
Subject: [PATCH] feat: Replace AsyncChromiumLoader with Playwright for web
 crawling and content extraction in link preview and web crawler connector
 modules.

---
 .../app/agents/new_chat/tools/link_preview.py | 25 +++++++--------
 .../app/connectors/webcrawler_connector.py    | 32 +++++++++++--------
 2 files changed, 30 insertions(+), 27 deletions(-)

diff --git a/surfsense_backend/app/agents/new_chat/tools/link_preview.py b/surfsense_backend/app/agents/new_chat/tools/link_preview.py
index 13f8a1f1a..3e2070a14 100644
--- a/surfsense_backend/app/agents/new_chat/tools/link_preview.py
+++ b/surfsense_backend/app/agents/new_chat/tools/link_preview.py
@@ -14,8 +14,8 @@ from urllib.parse import urlparse
 import httpx
 import trafilatura
 from fake_useragent import UserAgent
-from langchain_community.document_loaders import AsyncChromiumLoader
 from langchain_core.tools import tool
+from playwright.async_api import async_playwright
 
 
 logger = logging.getLogger(__name__)
@@ -170,7 +170,7 @@ def _make_absolute_url(image_url: str, base_url: str) -> str:
 
 async def fetch_with_chromium(url: str) -> dict[str, Any] | None:
     """
-    Fetch page content using headless Chromium browser.
+    Fetch page content using headless Chromium browser via Playwright.
     Used as a fallback when simple HTTP requests are blocked (403, etc.).
 
     Args:
@@ -186,18 +186,17 @@
         ua = UserAgent()
         user_agent = ua.random
 
-        # Use AsyncChromiumLoader to fetch the page
-        crawl_loader = AsyncChromiumLoader(
-            urls=[url], headless=True, user_agent=user_agent
-        )
-        documents = await crawl_loader.aload()
+        # Use Playwright to fetch the page
+        async with async_playwright() as p:
+            browser = await p.chromium.launch(headless=True)
+            context = await browser.new_context(user_agent=user_agent)
+            page = await context.new_page()
 
-        if not documents:
-            logger.warning(f"[link_preview] Chromium returned no documents for {url}")
-            return None
-
-        doc = documents[0]
-        raw_html = doc.page_content
+            try:
+                await page.goto(url, wait_until="domcontentloaded", timeout=30000)
+                raw_html = await page.content()
+            finally:
+                await browser.close()
 
         if not raw_html or len(raw_html.strip()) == 0:
             logger.warning(f"[link_preview] Chromium returned empty content for {url}")
diff --git a/surfsense_backend/app/connectors/webcrawler_connector.py b/surfsense_backend/app/connectors/webcrawler_connector.py
index 7ffc66644..5d6ea98c8 100644
--- a/surfsense_backend/app/connectors/webcrawler_connector.py
+++ b/surfsense_backend/app/connectors/webcrawler_connector.py
@@ -1,7 +1,7 @@
 """
 WebCrawler Connector Module
 
-A module for crawling web pages and extracting content using Firecrawl or AsyncChromiumLoader.
+A module for crawling web pages and extracting content using Firecrawl or Playwright.
 Provides a unified interface for web scraping.
 """
 
@@ -12,7 +12,7 @@ import trafilatura
 import validators
 from fake_useragent import UserAgent
 from firecrawl import AsyncFirecrawlApp
-from langchain_community.document_loaders import AsyncChromiumLoader
+from playwright.async_api import async_playwright
 
 
 logger = logging.getLogger(__name__)
@@ -149,7 +149,7 @@ class WebCrawlerConnector:
 
     async def _crawl_with_chromium(self, url: str) -> dict[str, Any]:
         """
-        Crawl URL using AsyncChromiumLoader with Trafilatura for content extraction.
+        Crawl URL using Playwright with Trafilatura for content extraction.
         Falls back to raw HTML if Trafilatura extraction fails.
 
         Args:
@@ -165,20 +165,24 @@ class WebCrawlerConnector:
         ua = UserAgent()
         user_agent = ua.random
 
-        # Pass User-Agent to AsyncChromiumLoader
-        crawl_loader = AsyncChromiumLoader(
-            urls=[url], headless=True, user_agent=user_agent
-        )
-        documents = await crawl_loader.aload()
+        # Use Playwright to fetch the page
+        async with async_playwright() as p:
+            browser = await p.chromium.launch(headless=True)
+            context = await browser.new_context(user_agent=user_agent)
+            page = await context.new_page()
 
-        if not documents:
+            try:
+                await page.goto(url, wait_until="domcontentloaded", timeout=30000)
+                raw_html = await page.content()
+                page_title = await page.title()
+            finally:
+                await browser.close()
+
+        if not raw_html:
             raise ValueError(f"Failed to load content from {url}")
 
-        doc = documents[0]
-        raw_html = doc.page_content
-
-        # Extract basic metadata from the document
-        base_metadata = doc.metadata if doc.metadata else {}
+        # Extract basic metadata from the page
+        base_metadata = {"title": page_title} if page_title else {}
 
         # Try to extract main content using Trafilatura
         extracted_content = None
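
Reviewer note: both hunks introduce the same Playwright fetch pattern, so a
minimal standalone sketch of it follows, assuming Playwright and trafilatura
are installed (pip install playwright trafilatura; playwright install
chromium). The fetch_page_html helper, main() driver, and the example URL are
illustrative only and do not appear in this patch; the Playwright and
trafilatura calls themselves mirror the ones the patch adds.

    import asyncio

    import trafilatura
    from playwright.async_api import async_playwright


    async def fetch_page_html(
        url: str, user_agent: str | None = None
    ) -> tuple[str, str]:
        """Fetch (raw_html, title) for a URL, always closing the browser."""
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context(user_agent=user_agent)
            page = await context.new_page()
            try:
                # "domcontentloaded" returns once the DOM is parsed rather
                # than waiting for every subresource; the 30s timeout matches
                # the value used in the patch.
                await page.goto(url, wait_until="domcontentloaded", timeout=30000)
                return await page.content(), await page.title()
            finally:
                # Runs on success, timeout, or navigation error, so no
                # headless Chromium process is leaked.
                await browser.close()


    async def main() -> None:
        # Illustrative usage; the URL is a placeholder.
        raw_html, title = await fetch_page_html("https://example.com")
        # Same extraction step both connectors apply to the raw markup.
        text = trafilatura.extract(raw_html)
        print(title, (text or raw_html)[:200])


    if __name__ == "__main__":
        asyncio.run(main())

The key design point the sketch demonstrates is keeping the try/finally
inside the async with async_playwright() block: the page is only valid while
the Playwright context is alive, and the finally clause guarantees browser
cleanup even when page.goto raises, which the old AsyncChromiumLoader code
handled internally.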