feat: Replace AsyncChromiumLoader with Playwright for web crawling and content extraction in link preview and web crawler connector modules.

This commit is contained in:
DESKTOP-RTLN3BA\$punk 2025-12-27 13:58:00 -08:00
parent c2e6bf2018
commit 9d0721de43
2 changed files with 30 additions and 27 deletions

View file

@@ -14,8 +14,8 @@ from urllib.parse import urlparse
import httpx import httpx
import trafilatura import trafilatura
from fake_useragent import UserAgent from fake_useragent import UserAgent
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_core.tools import tool from langchain_core.tools import tool
from playwright.async_api import async_playwright
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -170,7 +170,7 @@ def _make_absolute_url(image_url: str, base_url: str) -> str:
async def fetch_with_chromium(url: str) -> dict[str, Any] | None: async def fetch_with_chromium(url: str) -> dict[str, Any] | None:
""" """
Fetch page content using headless Chromium browser. Fetch page content using headless Chromium browser via Playwright.
Used as a fallback when simple HTTP requests are blocked (403, etc.). Used as a fallback when simple HTTP requests are blocked (403, etc.).
Args: Args:
@@ -186,18 +186,17 @@ async def fetch_with_chromium(url: str) -> dict[str, Any] | None:
ua = UserAgent() ua = UserAgent()
user_agent = ua.random user_agent = ua.random
# Use AsyncChromiumLoader to fetch the page # Use Playwright to fetch the page
crawl_loader = AsyncChromiumLoader( async with async_playwright() as p:
urls=[url], headless=True, user_agent=user_agent browser = await p.chromium.launch(headless=True)
) context = await browser.new_context(user_agent=user_agent)
documents = await crawl_loader.aload() page = await context.new_page()
if not documents: try:
logger.warning(f"[link_preview] Chromium returned no documents for {url}") await page.goto(url, wait_until="domcontentloaded", timeout=30000)
return None raw_html = await page.content()
finally:
doc = documents[0] await browser.close()
raw_html = doc.page_content
if not raw_html or len(raw_html.strip()) == 0: if not raw_html or len(raw_html.strip()) == 0:
logger.warning(f"[link_preview] Chromium returned empty content for {url}") logger.warning(f"[link_preview] Chromium returned empty content for {url}")

View file

@@ -1,7 +1,7 @@
""" """
WebCrawler Connector Module WebCrawler Connector Module
A module for crawling web pages and extracting content using Firecrawl or AsyncChromiumLoader. A module for crawling web pages and extracting content using Firecrawl or Playwright.
Provides a unified interface for web scraping. Provides a unified interface for web scraping.
""" """
@@ -12,7 +12,7 @@ import trafilatura
import validators import validators
from fake_useragent import UserAgent from fake_useragent import UserAgent
from firecrawl import AsyncFirecrawlApp from firecrawl import AsyncFirecrawlApp
from langchain_community.document_loaders import AsyncChromiumLoader from playwright.async_api import async_playwright
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -149,7 +149,7 @@ class WebCrawlerConnector:
async def _crawl_with_chromium(self, url: str) -> dict[str, Any]: async def _crawl_with_chromium(self, url: str) -> dict[str, Any]:
""" """
Crawl URL using AsyncChromiumLoader with Trafilatura for content extraction. Crawl URL using Playwright with Trafilatura for content extraction.
Falls back to raw HTML if Trafilatura extraction fails. Falls back to raw HTML if Trafilatura extraction fails.
Args: Args:
@@ -165,20 +165,24 @@ class WebCrawlerConnector:
ua = UserAgent() ua = UserAgent()
user_agent = ua.random user_agent = ua.random
# Pass User-Agent to AsyncChromiumLoader # Use Playwright to fetch the page
crawl_loader = AsyncChromiumLoader( async with async_playwright() as p:
urls=[url], headless=True, user_agent=user_agent browser = await p.chromium.launch(headless=True)
) context = await browser.new_context(user_agent=user_agent)
documents = await crawl_loader.aload() page = await context.new_page()
if not documents: try:
await page.goto(url, wait_until="domcontentloaded", timeout=30000)
raw_html = await page.content()
page_title = await page.title()
finally:
await browser.close()
if not raw_html:
raise ValueError(f"Failed to load content from {url}") raise ValueError(f"Failed to load content from {url}")
doc = documents[0] # Extract basic metadata from the page
raw_html = doc.page_content base_metadata = {"title": page_title} if page_title else {}
# Extract basic metadata from the document
base_metadata = doc.metadata if doc.metadata else {}
# Try to extract main content using Trafilatura # Try to extract main content using Trafilatura
extracted_content = None extracted_content = None