mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-25 08:46:22 +02:00
feat: Replace AsyncChromiumLoader with Playwright for web crawling and content extraction in link preview and web crawler connector modules.
This commit is contained in:
parent
c2e6bf2018
commit
9d0721de43
2 changed files with 30 additions and 27 deletions
|
|
@ -14,8 +14,8 @@ from urllib.parse import urlparse
|
||||||
import httpx
|
import httpx
|
||||||
import trafilatura
|
import trafilatura
|
||||||
from fake_useragent import UserAgent
|
from fake_useragent import UserAgent
|
||||||
from langchain_community.document_loaders import AsyncChromiumLoader
|
|
||||||
from langchain_core.tools import tool
|
from langchain_core.tools import tool
|
||||||
|
from playwright.async_api import async_playwright
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
@ -170,7 +170,7 @@ def _make_absolute_url(image_url: str, base_url: str) -> str:
|
||||||
|
|
||||||
async def fetch_with_chromium(url: str) -> dict[str, Any] | None:
|
async def fetch_with_chromium(url: str) -> dict[str, Any] | None:
|
||||||
"""
|
"""
|
||||||
Fetch page content using headless Chromium browser.
|
Fetch page content using headless Chromium browser via Playwright.
|
||||||
Used as a fallback when simple HTTP requests are blocked (403, etc.).
|
Used as a fallback when simple HTTP requests are blocked (403, etc.).
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|
@ -186,18 +186,17 @@ async def fetch_with_chromium(url: str) -> dict[str, Any] | None:
|
||||||
ua = UserAgent()
|
ua = UserAgent()
|
||||||
user_agent = ua.random
|
user_agent = ua.random
|
||||||
|
|
||||||
# Use AsyncChromiumLoader to fetch the page
|
# Use Playwright to fetch the page
|
||||||
crawl_loader = AsyncChromiumLoader(
|
async with async_playwright() as p:
|
||||||
urls=[url], headless=True, user_agent=user_agent
|
browser = await p.chromium.launch(headless=True)
|
||||||
)
|
context = await browser.new_context(user_agent=user_agent)
|
||||||
documents = await crawl_loader.aload()
|
page = await context.new_page()
|
||||||
|
|
||||||
if not documents:
|
try:
|
||||||
logger.warning(f"[link_preview] Chromium returned no documents for {url}")
|
await page.goto(url, wait_until="domcontentloaded", timeout=30000)
|
||||||
return None
|
raw_html = await page.content()
|
||||||
|
finally:
|
||||||
doc = documents[0]
|
await browser.close()
|
||||||
raw_html = doc.page_content
|
|
||||||
|
|
||||||
if not raw_html or len(raw_html.strip()) == 0:
|
if not raw_html or len(raw_html.strip()) == 0:
|
||||||
logger.warning(f"[link_preview] Chromium returned empty content for {url}")
|
logger.warning(f"[link_preview] Chromium returned empty content for {url}")
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
"""
|
"""
|
||||||
WebCrawler Connector Module
|
WebCrawler Connector Module
|
||||||
|
|
||||||
A module for crawling web pages and extracting content using Firecrawl or AsyncChromiumLoader.
|
A module for crawling web pages and extracting content using Firecrawl or Playwright.
|
||||||
Provides a unified interface for web scraping.
|
Provides a unified interface for web scraping.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
@ -12,7 +12,7 @@ import trafilatura
|
||||||
import validators
|
import validators
|
||||||
from fake_useragent import UserAgent
|
from fake_useragent import UserAgent
|
||||||
from firecrawl import AsyncFirecrawlApp
|
from firecrawl import AsyncFirecrawlApp
|
||||||
from langchain_community.document_loaders import AsyncChromiumLoader
|
from playwright.async_api import async_playwright
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
@ -149,7 +149,7 @@ class WebCrawlerConnector:
|
||||||
|
|
||||||
async def _crawl_with_chromium(self, url: str) -> dict[str, Any]:
|
async def _crawl_with_chromium(self, url: str) -> dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Crawl URL using AsyncChromiumLoader with Trafilatura for content extraction.
|
Crawl URL using Playwright with Trafilatura for content extraction.
|
||||||
Falls back to raw HTML if Trafilatura extraction fails.
|
Falls back to raw HTML if Trafilatura extraction fails.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|
@ -165,20 +165,24 @@ class WebCrawlerConnector:
|
||||||
ua = UserAgent()
|
ua = UserAgent()
|
||||||
user_agent = ua.random
|
user_agent = ua.random
|
||||||
|
|
||||||
# Pass User-Agent to AsyncChromiumLoader
|
# Use Playwright to fetch the page
|
||||||
crawl_loader = AsyncChromiumLoader(
|
async with async_playwright() as p:
|
||||||
urls=[url], headless=True, user_agent=user_agent
|
browser = await p.chromium.launch(headless=True)
|
||||||
)
|
context = await browser.new_context(user_agent=user_agent)
|
||||||
documents = await crawl_loader.aload()
|
page = await context.new_page()
|
||||||
|
|
||||||
if not documents:
|
try:
|
||||||
|
await page.goto(url, wait_until="domcontentloaded", timeout=30000)
|
||||||
|
raw_html = await page.content()
|
||||||
|
page_title = await page.title()
|
||||||
|
finally:
|
||||||
|
await browser.close()
|
||||||
|
|
||||||
|
if not raw_html:
|
||||||
raise ValueError(f"Failed to load content from {url}")
|
raise ValueError(f"Failed to load content from {url}")
|
||||||
|
|
||||||
doc = documents[0]
|
# Extract basic metadata from the page
|
||||||
raw_html = doc.page_content
|
base_metadata = {"title": page_title} if page_title else {}
|
||||||
|
|
||||||
# Extract basic metadata from the document
|
|
||||||
base_metadata = doc.metadata if doc.metadata else {}
|
|
||||||
|
|
||||||
# Try to extract main content using Trafilatura
|
# Try to extract main content using Trafilatura
|
||||||
extracted_content = None
|
extracted_content = None
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue