add web page scraping feature implemented by Playwright/Selenium

2026-07-05 16:02:14 +02:00 · 2023-07-24 00:18:36 +08:00 · 2023-07-24 00:18:36 +08:00 · ef279fdeb7
commit ef279fdeb7
parent cfd5749456
5 changed files with 326 additions and 24 deletions
--- a/metagpt/tools/init.py
+++ b/metagpt/tools/init.py
@ -15,3 +15,9 @@ class SearchEngineType(Enum):
    DIRECT_GOOGLE = auto()
    SERPER_GOOGLE = auto()
    CUSTOM_ENGINE = auto()
+
+
+class WebBrowserEngineType(Enum):
+    """Identifiers for the selectable web browser engines."""
+
+    PLAYWRIGHT = auto()
+    SELENIUM = auto()
+    CUSTOM_ENGINE = auto()
--- a/metagpt/tools/web_browser_engine.py
+++ b/metagpt/tools/web_browser_engine.py
@ -0,0 +1,58 @@
+#!/usr/bin/env python
+
+from __future__ import annotations
+import asyncio
+import importlib
+
+from typing import Any, Callable, Coroutine, overload
+
+from metagpt.config import Config
+from metagpt.tools import WebBrowserEngineType
+from bs4 import BeautifulSoup
+
+
+class WebBrowserEngine:
+    """Fetch web pages through a pluggable browser backend and extract their text.
+
+    The backend is selected by ``engine``; when it is omitted,
+    ``Config().web_browser_engine`` is used. The Playwright and Selenium wrapper
+    modules are imported lazily, so only the selected backend is imported.
+
+    Args:
+        engine: Backend to use, or ``None`` to take it from the configuration.
+        run_func: Coroutine function used when ``engine`` is ``CUSTOM_ENGINE``. It is
+            awaited as ``run_func(url, *urls)`` and its result (a ``str`` or a list
+            of ``str``) is parsed as HTML. For the built-in engines it is replaced
+            by the wrapper's own ``run`` method.
+
+    Raises:
+        ValueError: If ``engine`` is ``CUSTOM_ENGINE`` but ``run_func`` is ``None``.
+        NotImplementedError: If ``engine`` is not one of the supported engine types.
+    """
+
+    def __init__(
+        self,
+        engine: WebBrowserEngineType | None = None,
+        run_func: Callable[..., Coroutine[Any, Any, str | list[str]]] | None = None,
+    ):
+        self.config = Config()
+        engine = engine or self.config.web_browser_engine
+
+        if engine == WebBrowserEngineType.PLAYWRIGHT:
+            web_browser_engine = importlib.import_module("metagpt.tools.web_browser_engine_playwright")
+            run_func = web_browser_engine.PlaywrightWrapper().run
+        elif engine == WebBrowserEngineType.SELENIUM:
+            web_browser_engine = importlib.import_module("metagpt.tools.web_browser_engine_selenium")
+            run_func = web_browser_engine.SeleniumWrapper().run
+        elif engine == WebBrowserEngineType.CUSTOM_ENGINE:
+            # Fail at construction time instead of with an opaque
+            # "'NoneType' object is not callable" on the first run() call.
+            if run_func is None:
+                raise ValueError("run_func must be provided when engine is CUSTOM_ENGINE")
+        else:
+            raise NotImplementedError(f"Unsupported web browser engine: {engine!r}")
+        self.run_func = run_func
+        self.engine = engine
+
+    @overload
+    async def run(self, url: str) -> str:
+        ...
+
+    @overload
+    async def run(self, url: str, *urls: str) -> list[str]:
+        ...
+
+    async def run(self, url: str, *urls: str) -> str | list[str]:
+        """Fetch one or more pages and return their extracted text.
+
+        Args:
+            url: First page to fetch.
+            *urls: Additional pages to fetch.
+
+        Returns:
+            The text produced by ``get_page_content`` for the page when the backend
+            returns a single ``str``; otherwise a list with one text per returned page.
+        """
+        page = await self.run_func(url, *urls)
+        if isinstance(page, str):
+            return get_page_content(page)
+        return [get_page_content(i) for i in page]
+
+
+def get_page_content(page: str):
+    """Return the text of the heading, paragraph and ``pre`` elements of an HTML page.
+
+    The markup is parsed with BeautifulSoup's ``html.parser``. The stripped text of
+    every ``h1``-``h5``, ``p`` and ``pre`` element is joined with newlines, in the
+    order ``find_all`` yields the elements.
+    """
+    wanted_tags = ["h1", "h2", "h3", "h4", "h5", "p", "pre"]
+    parsed = BeautifulSoup(page, "html.parser")
+    fragments = [element.text.strip() for element in parsed.find_all(wanted_tags)]
+    return "\n".join(fragments)
+
+
+if __name__ == "__main__":
+    # Manual smoke test: scrape one page with the configured engine and print its text.
+    page_text = asyncio.run(WebBrowserEngine().run("https://fuzhi.ai/"))
+    print(page_text)
--- a/metagpt/tools/web_browser_engine_playwright.py
+++ b/metagpt/tools/web_browser_engine_playwright.py
@ -0,0 +1,122 @@
+#!/usr/bin/env python
+from __future__ import annotations
+
+import asyncio
+from pathlib import Path
+import sys
+from typing import Literal
+from playwright.async_api import async_playwright
+from metagpt.config import Config
+from metagpt.logs import logger
+
+
+class PlaywrightWrapper:
+    """Wrapper around Playwright.
+
+    To use this module, you should have the ``playwright`` Python package installed and ensure
+    that the required browsers are also installed. You can download the necessary browser binaries
+    by running the command `playwright install` for the first time.
+
+    Args:
+        browser_type: Name of the browser to launch; defaults to
+            ``Config().playwright_browser_type``.
+        launch_kwargs: Keyword arguments forwarded to ``browser_type.launch``. The
+            mapping is copied, so the caller's dict is never modified. When a global
+            proxy is configured and neither a ``proxy`` entry nor a
+            ``--proxy-server=`` launch argument is present, the global proxy is added.
+        **kwargs: Only ``ignore_https_errors`` is used; it is forwarded to
+            ``browser.new_context``. Other keys are ignored.
+    """
+
+    def __init__(
+        self,
+        browser_type: Literal["chromium", "firefox", "webkit"] | None = None,
+        launch_kwargs: dict | None = None,
+        **kwargs,
+    ) -> None:
+        config = Config()
+        self.config = config
+        if browser_type is None:
+            browser_type = config.playwright_browser_type
+        self.browser_type = browser_type
+        # Copy before editing: the proxy default below and the executable_path
+        # fallback set in _run_precheck must not leak into the caller's dict.
+        launch_kwargs = dict(launch_kwargs or {})
+        if config.global_proxy and "proxy" not in launch_kwargs:
+            args = launch_kwargs.get("args", [])
+            if not any(str.startswith(i, "--proxy-server=") for i in args):
+                launch_kwargs["proxy"] = {"server": config.global_proxy}
+        self.launch_kwargs = launch_kwargs
+        context_kwargs = {}
+        if "ignore_https_errors" in kwargs:
+            context_kwargs["ignore_https_errors"] = kwargs["ignore_https_errors"]
+        self._context_kwargs = context_kwargs
+        self._has_run_precheck = False
+
+    async def run(self, url: str, *urls: str) -> str | list[str]:
+        """Load the given pages and return their HTML.
+
+        Each URL is opened in its own browser context; the page is scrolled to the
+        bottom before its content is read.
+
+        Returns:
+            The page content for a single URL, or a list of page contents (in
+            argument order) when extra ``urls`` are given. A page that fails to load
+            yields the string ``"Fail to load page content for <error>"`` instead of
+            raising.
+        """
+        async with async_playwright() as ap:
+            browser_type = getattr(ap, self.browser_type)
+            await self._run_precheck(browser_type)
+            browser = await browser_type.launch(**self.launch_kwargs)
+
+            async def _scrape(url):
+                context = await browser.new_context(**self._context_kwargs)
+                page = await context.new_page()
+                async with page:
+                    try:
+                        await page.goto(url)
+                        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+                        content = await page.content()
+                        return content
+                    except Exception as e:
+                        # Best effort: report the failure as the page text so that one
+                        # bad URL does not abort the other scrapes.
+                        return f"Fail to load page content for {e}"
+
+            if urls:
+                return await asyncio.gather(_scrape(url), *(_scrape(i) for i in urls))
+            return await _scrape(url)
+
+    async def _run_precheck(self, browser_type):
+        """Make sure a browser executable is available, installing it if necessary.
+
+        Runs at most once per instance. If the executable reported by
+        ``browser_type`` does not exist and the caller gave no ``executable_path``,
+        the browser is installed via ``_install_browsers``. If the expected path is
+        still missing afterwards, the first ``<browser_type>-*`` directory found
+        three levels above it is used as a fallback build.
+        """
+        if self._has_run_precheck:
+            return
+
+        executable_path = Path(browser_type.executable_path)
+        if not executable_path.exists() and "executable_path" not in self.launch_kwargs:
+            kwargs = {}
+            if self.config.global_proxy:
+                import os
+
+                # ``env`` replaces the child's whole environment, so extend the current
+                # one; otherwise the installer would run without PATH, HOME, etc.
+                kwargs["env"] = {**os.environ, "ALL_PROXY": self.config.global_proxy}
+            await _install_browsers(self.browser_type, **kwargs)
+            if not executable_path.exists():
+                parts = executable_path.parts
+                available_paths = list(Path(*parts[:-3]).glob(f"{self.browser_type}-*"))
+                if available_paths:
+                    logger.warning(
+                        "It seems that your OS is not officially supported by Playwright. "
+                        "Try to set executable_path to the fallback build version."
+                    )
+                    executable_path = available_paths[0].joinpath(*parts[-2:])
+                    self.launch_kwargs["executable_path"] = str(executable_path)
+        self._has_run_precheck = True
+
+
+async def _install_browsers(*browsers, **kwargs) -> None:
+    """Run ``python -m playwright install <browsers> --with-deps`` and log its output.
+
+    Args:
+        *browsers: Browser names passed to ``playwright install``.
+        **kwargs: Extra keyword arguments forwarded to
+            ``asyncio.create_subprocess_exec`` (for example ``env``).
+
+    The outcome is only logged; a non-zero exit status does not raise.
+    """
+    command = [sys.executable, "-m", "playwright", "install", *browsers, "--with-deps"]
+    process = await asyncio.create_subprocess_exec(
+        *command,
+        stdout=asyncio.subprocess.PIPE,
+        stderr=asyncio.subprocess.PIPE,
+        **kwargs,
+    )
+
+    # Forward both pipes to the logger concurrently: stdout as info, stderr as warning.
+    await asyncio.gather(
+        _log_stream(process.stdout, logger.info),
+        _log_stream(process.stderr, logger.warning),
+    )
+
+    exit_code = await process.wait()
+    if exit_code != 0:
+        logger.warning("Fail to install browser for playwright.")
+        return
+    logger.info("Install browser for playwright successfully.")
+
+
+async def _log_stream(sr, log_func):
+    """Pass every line read from ``sr`` to ``log_func`` until the stream is exhausted.
+
+    Args:
+        sr: Object with an awaitable ``readline()`` returning ``bytes``; an empty
+            result ends the loop.
+        log_func: Callable that receives each decoded, stripped line with a
+            ``[playwright install browser]: `` prefix.
+    """
+    raw_line = await sr.readline()
+    while raw_line:
+        log_func(f"[playwright install browser]: {raw_line.decode().strip()}")
+        raw_line = await sr.readline()
+
+
+if __name__ == "__main__":
+    # Manual smoke test: fetch the same page once with each browser type.
+    for browser_name in ("chromium", "firefox", "webkit"):
+        page_html = asyncio.run(PlaywrightWrapper(browser_name).run("https://httpbin.org/ip"))
+        print(page_html)
+        print(browser_name)
--- a/metagpt/tools/web_browser_engine_selenium.py
+++ b/metagpt/tools/web_browser_engine_selenium.py
@ -0,0 +1,108 @@
+#!/usr/bin/env python
+from __future__ import annotations
+
+import asyncio
+from copy import deepcopy
+import importlib
+from typing import Literal
+from metagpt.config import Config
+import asyncio
+
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.wait import WebDriverWait
+from concurrent import futures
+
+
+class SeleniumWrapper:
+    """Wrapper around Selenium.
+
+    To use this module, you should have the ``selenium`` Python package installed and ensure
+    that the required browsers are also installed.
+
+    Args:
+        browser_type: Browser to drive; defaults to ``Config().selenium_browser_type``.
+        launch_kwargs: Options turned into ``--key=value`` browser arguments. The
+            special key ``executable_path`` is taken out and used as the driver
+            path instead. The mapping is copied, so the caller's dict is never
+            modified. When a global proxy is configured and no ``proxy-server``
+            entry is present, the global proxy is added.
+        loop: Event loop whose ``run_in_executor`` runs the blocking Selenium calls.
+            When omitted, the loop running at call time is used.
+        executor: Executor passed to ``run_in_executor``.
+    """
+
+    def __init__(
+        self,
+        browser_type: Literal["chrome", "firefox", "edge", "ie"] | None = None,
+        launch_kwargs: dict | None = None,
+        *,
+        loop: asyncio.AbstractEventLoop | None = None,
+        executor: futures.Executor | None = None,
+    ) -> None:
+        config = Config()
+        self.config = config
+        if browser_type is None:
+            browser_type = config.selenium_browser_type
+        self.browser_type = browser_type
+        # Copy before editing: the proxy default and the pop() below must not
+        # change the dict owned by the caller.
+        launch_kwargs = dict(launch_kwargs or {})
+        if config.global_proxy and "proxy-server" not in launch_kwargs:
+            launch_kwargs["proxy-server"] = config.global_proxy
+
+        self.executable_path = launch_kwargs.pop("executable_path", None)
+        self.launch_args = [f"--{k}={v}" for k, v in launch_kwargs.items()]
+        self._has_run_precheck = False
+        self._get_driver = None
+        self.loop = loop
+        self.executor = executor
+
+    async def run(self, url: str, *urls: str) -> str | list[str]:
+        """Load the given pages in the executor and return their page source.
+
+        Returns:
+            The page source for a single URL, or a list of page sources (in
+            argument order) when extra ``urls`` are given.
+        """
+        await self._run_precheck()
+        # Resolve the loop on every call rather than caching the first one, so the
+        # instance keeps working when reused under a later asyncio.run().
+        loop = self.loop or asyncio.get_running_loop()
+
+        def _scrape(target_url):
+            return loop.run_in_executor(self.executor, self._scrape_website, target_url)
+
+        if urls:
+            return await asyncio.gather(_scrape(url), *(_scrape(i) for i in urls))
+        return await _scrape(url)
+
+    async def _run_precheck(self):
+        """Build the driver factory once, in the executor, on first use."""
+        if self._has_run_precheck:
+            return
+        loop = self.loop or asyncio.get_running_loop()
+        self._get_driver = await loop.run_in_executor(
+            self.executor,
+            lambda: _gen_get_driver_func(self.browser_type, *self.launch_args, executable_path=self.executable_path),
+        )
+        self._has_run_precheck = True
+
+    def _scrape_website(self, url):
+        """Open ``url`` in a fresh driver and return its page source (blocking).
+
+        Waits up to 30 seconds for a ``body`` element to be present before
+        reading the source.
+        """
+        with self._get_driver() as driver:
+            driver.get(url)
+            WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
+            return driver.page_source
+
+
+# Maps a browser name to the (module name, class name) of the driver manager that
+# _gen_get_driver_func imports to install a driver for that browser.
+_webdriver_manager_types = {
+    "chrome": ("webdriver_manager.chrome", "ChromeDriverManager"),
+    "firefox": ("webdriver_manager.firefox", "GeckoDriverManager"),
+    "edge": ("webdriver_manager.microsoft", "EdgeChromiumDriverManager"),
+    "ie": ("webdriver_manager.microsoft", "IEDriverManager"),
+}
+
+
+def _gen_get_driver_func(browser_type, *args, executable_path=None):
+    """Build a zero-argument factory that creates a headless Selenium driver.
+
+    Args:
+        browser_type: Name of the ``selenium.webdriver`` sub-package to use; also
+            the key into ``_webdriver_manager_types``.
+        *args: Extra command-line arguments added to the browser options.
+        executable_path: Path of the driver binary. When falsy, the driver manager
+            registered for ``browser_type`` is used and its ``install()`` result
+            becomes the path.
+
+    Returns:
+        A callable that builds fresh options (``--headless``, plus ``--no-sandbox``
+        for chrome, plus ``args``) and returns a new ``WebDriver``.
+    """
+    package = f"selenium.webdriver.{browser_type}"
+    WebDriver = importlib.import_module(f"{package}.webdriver").WebDriver
+    Service = importlib.import_module(f"{package}.service").Service
+    Options = importlib.import_module(f"{package}.options").Options
+
+    if not executable_path:
+        manager_module, manager_class = _webdriver_manager_types[browser_type]
+        DriverManager = getattr(importlib.import_module(manager_module), manager_class)
+        executable_path = DriverManager().install()
+
+    def _get_driver():
+        options = Options()
+        sandbox_flags = ("--no-sandbox",) if browser_type == "chrome" else ()
+        for flag in ("--headless", *sandbox_flags, *args):
+            options.add_argument(flag)
+        return WebDriver(options=deepcopy(options), service=Service(executable_path=executable_path))
+
+    return _get_driver
+
+
+if __name__ == "__main__":
+    # Manual smoke test: scrape one page with Chrome and print its source.
+    page_source = asyncio.run(SeleniumWrapper("chrome").run("https://fuzhi.ai/"))
+    print(page_source)