From ef279fdeb7b4f8f6573bf2081feee9650682e8f7 Mon Sep 17 00:00:00 2001 From: shenchucheng Date: Mon, 24 Jul 2023 00:18:36 +0800 Subject: [PATCH 1/9] add web page scraping feature implemented by Playwright/Selenium --- metagpt/config.py | 56 ++++---- metagpt/tools/__init__.py | 6 + metagpt/tools/web_browser_engine.py | 58 +++++++++ .../tools/web_browser_engine_playwright.py | 122 ++++++++++++++++++ metagpt/tools/web_browser_engine_selenium.py | 108 ++++++++++++++++ 5 files changed, 326 insertions(+), 24 deletions(-) create mode 100644 metagpt/tools/web_browser_engine.py create mode 100644 metagpt/tools/web_browser_engine_playwright.py create mode 100644 metagpt/tools/web_browser_engine_selenium.py diff --git a/metagpt/config.py b/metagpt/config.py index 84867258e..47bf36df4 100644 --- a/metagpt/config.py +++ b/metagpt/config.py @@ -4,6 +4,7 @@ 提供配置,单例 """ import os +import openai import yaml @@ -11,7 +12,7 @@ from metagpt.logs import logger from metagpt.const import PROJECT_ROOT from metagpt.utils.singleton import Singleton -from metagpt.tools import SearchEngineType +from metagpt.tools import SearchEngineType, WebBrowserEngineType class NotConfiguredException(Exception): @@ -33,35 +34,42 @@ class Config(metaclass=Singleton): secret_key = config.get_key("MY_SECRET_KEY") print("Secret key:", secret_key) """ + _instance = None - key_yaml_file = PROJECT_ROOT / 'config/key.yaml' - default_yaml_file = PROJECT_ROOT / 'config/config.yaml' + key_yaml_file = PROJECT_ROOT / "config/key.yaml" + default_yaml_file = PROJECT_ROOT / "config/config.yaml" def __init__(self, yaml_file=default_yaml_file): self._configs = {} self._init_with_config_files_and_env(self._configs, yaml_file) - logger.info('Config loading done.') - self.openai_api_key = self._get('OPENAI_API_KEY') - if not self.openai_api_key or 'YOUR_API_KEY' == self.openai_api_key: + logger.info("Config loading done.") + self.global_proxy = self._get("GLOBAL_PROXY") + self.openai_api_key = self._get("OPENAI_API_KEY") + if 
not self.openai_api_key or "YOUR_API_KEY" == self.openai_api_key: raise NotConfiguredException("Set OPENAI_API_KEY first") - self.openai_api_base = self._get('OPENAI_API_BASE') - if not self.openai_api_base or 'YOUR_API_BASE' == self.openai_api_base: - logger.info("Set OPENAI_API_BASE in case of network issues") - self.openai_api_type = self._get('OPENAI_API_TYPE') - self.openai_api_version = self._get('OPENAI_API_VERSION') - self.openai_api_rpm = self._get('RPM', 3) - self.openai_api_model = self._get('OPENAI_API_MODEL', "gpt-4") - self.max_tokens_rsp = self._get('MAX_TOKENS', 2048) - self.deployment_id = self._get('DEPLOYMENT_ID') + self.openai_api_base = self._get("OPENAI_API_BASE") + if not self.openai_api_base or "YOUR_API_BASE" == self.openai_api_base: + openai_proxy = self._get("OPENAI_PROXY") or self.global_proxy + if openai_proxy: + openai.proxy = openai_proxy + else: + logger.info("Set OPENAI_API_BASE in case of network issues") + self.openai_api_type = self._get("OPENAI_API_TYPE") + self.openai_api_version = self._get("OPENAI_API_VERSION") + self.openai_api_rpm = self._get("RPM", 3) + self.openai_api_model = self._get("OPENAI_API_MODEL", "gpt-4") + self.max_tokens_rsp = self._get("MAX_TOKENS", 2048) + self.deployment_id = self._get("DEPLOYMENT_ID") - self.claude_api_key = self._get('Anthropic_API_KEY') - - self.serpapi_api_key = self._get('SERPAPI_API_KEY') - self.serper_api_key = self._get('SERPER_API_KEY') - self.google_api_key = self._get('GOOGLE_API_KEY') - self.google_cse_id = self._get('GOOGLE_CSE_ID') - self.search_engine = self._get('SEARCH_ENGINE', SearchEngineType.SERPAPI_GOOGLE) - self.max_budget = self._get('MAX_BUDGET', 10.0) + self.serpapi_api_key = self._get("SERPAPI_API_KEY") + self.serper_api_key = self._get("SERPER_API_KEY") + self.google_api_key = self._get("GOOGLE_API_KEY") + self.google_cse_id = self._get("GOOGLE_CSE_ID") + self.search_engine = self._get("SEARCH_ENGINE", SearchEngineType.SERPAPI_GOOGLE) + self.web_browser_engine = 
self._get("WEB_BROWSER_ENGINE", WebBrowserEngineType.PLAYWRIGHT) + self.playwright_browser_type = self._get("PLAYWRIGHT_BROWSER_TYPE", "chromium") + self.selenium_browser_type = self._get("selenium_browser_type", "chrome") + self.max_budget = self._get("MAX_BUDGET", 10.0) self.total_cost = 0.0 def _init_with_config_files_and_env(self, configs: dict, yaml_file): @@ -73,7 +81,7 @@ class Config(metaclass=Singleton): continue # 加载本地 YAML 文件 - with open(_yaml_file, 'r', encoding="utf-8") as file: + with open(_yaml_file, "r", encoding="utf-8") as file: yaml_data = yaml.safe_load(file) if not yaml_data: continue diff --git a/metagpt/tools/__init__.py b/metagpt/tools/__init__.py index 46ee0a0a0..6f418baf4 100644 --- a/metagpt/tools/__init__.py +++ b/metagpt/tools/__init__.py @@ -15,3 +15,9 @@ class SearchEngineType(Enum): DIRECT_GOOGLE = auto() SERPER_GOOGLE = auto() CUSTOM_ENGINE = auto() + + +class WebBrowserEngineType(Enum): + PLAYWRIGHT = auto() + SELENIUM = auto() + CUSTOM_ENGINE = auto() diff --git a/metagpt/tools/web_browser_engine.py b/metagpt/tools/web_browser_engine.py new file mode 100644 index 000000000..f38b4bd8f --- /dev/null +++ b/metagpt/tools/web_browser_engine.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python + +from __future__ import annotations +import asyncio +import importlib + +from typing import Any, Callable, Coroutine, overload + +from metagpt.config import Config +from metagpt.tools import WebBrowserEngineType +from bs4 import BeautifulSoup + + +class WebBrowserEngine: + def __init__( + self, + engine: WebBrowserEngineType | None = None, + run_func: Callable[..., Coroutine[Any, Any, str | list[str]]] | None = None, + ): + self.config = Config() + engine = engine or self.config.web_browser_engine + + if engine == WebBrowserEngineType.PLAYWRIGHT: + web_browser_engine = importlib.import_module("metagpt.tools.web_browser_engine_playwright") + run_func = web_browser_engine.PlaywrightWrapper().run + elif engine == WebBrowserEngineType.SELENIUM: + 
web_browser_engine = importlib.import_module("metagpt.tools.web_browser_engine_selenium") + run_func = web_browser_engine.SeleniumWrapper().run + elif engine == WebBrowserEngineType.CUSTOM_ENGINE: + run_func = run_func + else: + raise NotImplementedError + self.run_func = run_func + self.engine = engine + + @overload + async def run(self, url: str) -> str: + ... + + @overload + async def run(self, url: str, *urls: str) -> list[str]: + ... + + async def run(self, url: str, *urls: str) -> str | list[str]: + page = await self.run_func(url, *urls) + if isinstance(page, str): + return get_page_content(page) + return [get_page_content(i) for i in page] + + +def get_page_content(page: str): + soup = BeautifulSoup(page, "html.parser") + return "\n".join(i.text.strip() for i in soup.find_all(["h1", "h2", "h3", "h4", "h5", "p", "pre"])) + + +if __name__ == "__main__": + text = asyncio.run(WebBrowserEngine().run("https://fuzhi.ai/")) + print(text) diff --git a/metagpt/tools/web_browser_engine_playwright.py b/metagpt/tools/web_browser_engine_playwright.py new file mode 100644 index 000000000..6546d7a29 --- /dev/null +++ b/metagpt/tools/web_browser_engine_playwright.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python +from __future__ import annotations + +import asyncio +from pathlib import Path +import sys +from typing import Literal +from playwright.async_api import async_playwright +from metagpt.config import Config +from metagpt.logs import logger + + +class PlaywrightWrapper: + """Wrapper around Playwright. + + To use this module, you should have the ``playwright`` Python package installed and ensure + that the required browsers are also installed. You can download the necessary browser binaries + by running the command `playwright install` for the first time. 
+ """ + + def __init__( + self, + browser_type: Literal["chromium", "firefox", "webkit"] | None = None, + launch_kwargs: dict | None = None, + **kwargs, + ) -> None: + config = Config() + self.config = config + if browser_type is None: + browser_type = config.playwright_browser_type + self.browser_type = browser_type + launch_kwargs = launch_kwargs or {} + if config.global_proxy and "proxy" not in launch_kwargs: + args = launch_kwargs.get("args", []) + if not any(str.startswith(i, "--proxy-server=") for i in args): + launch_kwargs["proxy"] = {"server": config.global_proxy} + self.launch_kwargs = launch_kwargs + context_kwargs = {} + if "ignore_https_errors" in kwargs: + context_kwargs["ignore_https_errors"] = kwargs["ignore_https_errors"] + self._context_kwargs = context_kwargs + self._has_run_precheck = False + + async def run(self, url: str, *urls: str) -> str | list[str]: + async with async_playwright() as ap: + browser_type = getattr(ap, self.browser_type) + await self._run_precheck(browser_type) + browser = await browser_type.launch(**self.launch_kwargs) + + async def _scrape(url): + context = await browser.new_context(**self._context_kwargs) + page = await context.new_page() + async with page: + try: + await page.goto(url) + await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + content = await page.content() + return content + except Exception as e: + return f"Fail to load page content for {e}" + + if urls: + return await asyncio.gather(_scrape(url), *(_scrape(i) for i in urls)) + return await _scrape(url) + + async def _run_precheck(self, browser_type): + if self._has_run_precheck: + return + + executable_path = Path(browser_type.executable_path) + if not executable_path.exists() and "executable_path" not in self.launch_kwargs: + kwargs = {} + if self.config.global_proxy: + kwargs["env"] = {"ALL_PROXY": self.config.global_proxy} + await _install_browsers(self.browser_type, **kwargs) + if not executable_path.exists(): + parts = 
executable_path.parts + available_paths = list(Path(*parts[:-3]).glob(f"{self.browser_type}-*")) + if available_paths: + logger.warning( + "It seems that your OS is not officially supported by Playwright. " + "Try to set executable_path to the fallback build version." + ) + executable_path = available_paths[0].joinpath(*parts[-2:]) + self.launch_kwargs["executable_path"] = str(executable_path) + self._has_run_precheck = True + + +async def _install_browsers(*browsers, **kwargs) -> None: + process = await asyncio.create_subprocess_exec( + sys.executable, + "-m", + "playwright", + "install", + *browsers, + "--with-deps", + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + **kwargs, + ) + + await asyncio.gather(_log_stream(process.stdout, logger.info), _log_stream(process.stderr, logger.warning)) + + if await process.wait() == 0: + logger.info(f"Install browser for playwright successfully.") + else: + logger.warning(f"Fail to install browser for playwright.") + + +async def _log_stream(sr, log_func): + while True: + line = await sr.readline() + if not line: + return + log_func(f"[playwright install browser]: {line.decode().strip()}") + + +if __name__ == "__main__": + for i in ("chromium", "firefox", "webkit"): + text = asyncio.run(PlaywrightWrapper(i).run("https://httpbin.org/ip")) + print(text) + print(i) diff --git a/metagpt/tools/web_browser_engine_selenium.py b/metagpt/tools/web_browser_engine_selenium.py new file mode 100644 index 000000000..d5d01daf2 --- /dev/null +++ b/metagpt/tools/web_browser_engine_selenium.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python +from __future__ import annotations + +import asyncio +from copy import deepcopy +import importlib +from typing import Literal +from metagpt.config import Config +import asyncio + +from selenium.webdriver.common.by import By +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.support.wait import WebDriverWait +from concurrent import futures + + +class 
SeleniumWrapper: + """Wrapper around Selenium. + + To use this module, you should have the ``selenium`` Python package installed and ensure + that the required browsers are also installed. + """ + + def __init__( + self, + browser_type: Literal["chrome", "firefox", "edge", "ie"] | None = None, + launch_kwargs: dict | None = None, + *, + loop: asyncio.AbstractEventLoop | None = None, + executor: futures.Executor | None = None, + ) -> None: + config = Config() + self.config = config + if browser_type is None: + browser_type = config.selenium_browser_type + self.browser_type = browser_type + launch_kwargs = launch_kwargs or {} + if config.global_proxy and "proxy-server" not in launch_kwargs: + launch_kwargs["proxy-server"] = config.global_proxy + + self.executable_path = launch_kwargs.pop("executable_path", None) + self.launch_args = [f"--{k}={v}" for k, v in launch_kwargs.items()] + self._has_run_precheck = False + self._get_driver = None + self.loop = loop + self.executor = executor + + async def run(self, url: str, *urls: str) -> str | list[str]: + await self._run_precheck() + + _scrape = lambda url: self.loop.run_in_executor(self.executor, self._scrape_website, url) + + if urls: + return await asyncio.gather(_scrape(url), *(_scrape(i) for i in urls)) + return await _scrape(url) + + async def _run_precheck(self): + if self._has_run_precheck: + return + self.loop = self.loop or asyncio.get_event_loop() + self._get_driver = await self.loop.run_in_executor( + self.executor, + lambda: _gen_get_driver_func(self.browser_type, *self.launch_args, executable_path=self.executable_path), + ) + self._has_run_precheck = True + + def _scrape_website(self, url): + with self._get_driver() as driver: + driver.get(url) + WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.TAG_NAME, "body"))) + return driver.page_source + + +_webdriver_manager_types = { + "chrome": ("webdriver_manager.chrome", "ChromeDriverManager"), + "firefox": ("webdriver_manager.firefox", 
"GeckoDriverManager"), + "edge": ("webdriver_manager.microsoft", "EdgeChromiumDriverManager"), + "ie": ("webdriver_manager.microsoft", "IEDriverManager"), +} + + +def _gen_get_driver_func(browser_type, *args, executable_path=None): + WebDriver = getattr(importlib.import_module(f"selenium.webdriver.{browser_type}.webdriver"), "WebDriver") + Service = getattr(importlib.import_module(f"selenium.webdriver.{browser_type}.service"), "Service") + Options = getattr(importlib.import_module(f"selenium.webdriver.{browser_type}.options"), "Options") + + if not executable_path: + module_name, type_name = _webdriver_manager_types[browser_type] + DriverManager = getattr(importlib.import_module(module_name), type_name) + driver_manager = DriverManager() + # driver_manager.driver_cache.find_driver(driver_manager.driver)) + executable_path = driver_manager.install() + + def _get_driver(): + options = Options() + options.add_argument("--headless") + if browser_type == "chrome": + options.add_argument("--no-sandbox") + for i in args: + options.add_argument(i) + return WebDriver(options=deepcopy(options), service=Service(executable_path=executable_path)) + + return _get_driver + + +if __name__ == "__main__": + text = asyncio.run(SeleniumWrapper("chrome").run("https://fuzhi.ai/")) + print(text) From e657f298f23d9a3d25162ecda8fb39f7e1e5e9e2 Mon Sep 17 00:00:00 2001 From: shenchucheng Date: Mon, 24 Jul 2023 00:19:59 +0800 Subject: [PATCH 2/9] add test for web browser engine --- tests/conftest.py | 32 ++++++++++++++++- .../metagpt/tools/test_web_browser_engine.py | 25 +++++++++++++ .../test_web_browser_engine_playwright.py | 35 +++++++++++++++++++ .../tools/test_web_browser_engine_selenium.py | 35 +++++++++++++++++++ 4 files changed, 126 insertions(+), 1 deletion(-) create mode 100644 tests/metagpt/tools/test_web_browser_engine.py create mode 100644 tests/metagpt/tools/test_web_browser_engine_playwright.py create mode 100644 tests/metagpt/tools/test_web_browser_engine_selenium.py diff 
--git a/tests/conftest.py b/tests/conftest.py index b440426c5..eaf682feb 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -11,6 +11,8 @@ import pytest from metagpt.logs import logger from metagpt.provider.openai_api import OpenAIGPTAPI as GPTAPI +import asyncio +import re class Context: @@ -36,4 +38,32 @@ def llm_api(): @pytest.fixture(scope="function") def mock_llm(): # Create a mock LLM for testing - return Mock() \ No newline at end of file + return Mock() + + +@pytest.fixture(scope="session") +def proxy(): + pattern = re.compile( + rb"(?P[a-zA-Z]+) (?P(\w+://)?(?P[^\s\'\"<>\[\]{}|/:]+)(:(?P\d+))?[^\s\'\"<>\[\]{}|]*) " + ) + + async def pipe(reader, writer): + while not reader.at_eof(): + writer.write(await reader.read(2048)) + writer.close() + + async def handle_client(reader, writer): + data = await reader.readuntil(b"\r\n\r\n") + print(f"Proxy: {data}") # checking with capfd fixture + infos = pattern.match(data) + host, port = infos.group("host"), infos.group("port") + port = int(port) if port else 80 + remote_reader, remote_writer = await asyncio.open_connection(host, port) + if data.startswith(b"CONNECT"): + writer.write(b"HTTP/1.1 200 Connection Established\r\n\r\n") + else: + remote_writer.write(data) + await asyncio.gather(pipe(reader, remote_writer), pipe(remote_reader, writer)) + + server = asyncio.get_event_loop().run_until_complete(asyncio.start_server(handle_client, "127.0.0.1", 0)) + return "http://{}:{}".format(*server.sockets[0].getsockname()) diff --git a/tests/metagpt/tools/test_web_browser_engine.py b/tests/metagpt/tools/test_web_browser_engine.py new file mode 100644 index 000000000..57335de9c --- /dev/null +++ b/tests/metagpt/tools/test_web_browser_engine.py @@ -0,0 +1,25 @@ +import pytest +from metagpt.config import Config +from metagpt.tools import web_browser_engine, WebBrowserEngineType + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "browser_type, url, urls", + [ + (WebBrowserEngineType.PLAYWRIGHT, "https://fuzhi.ai", 
("https://fuzhi.ai",)), + (WebBrowserEngineType.SELENIUM, "https://fuzhi.ai", ("https://fuzhi.ai",)), + ], + ids=["playwright", "selenium"], +) +async def test_scrape_web_page(browser_type, url, urls): + browser = web_browser_engine.WebBrowserEngine(browser_type) + result = await browser.run(url) + assert isinstance(result, str) + assert "深度赋智" in result + + if urls: + results = await browser.run(url, *urls) + assert isinstance(results, list) + assert len(results) == len(urls) + 1 + assert all(("深度赋智" in i) for i in results) diff --git a/tests/metagpt/tools/test_web_browser_engine_playwright.py b/tests/metagpt/tools/test_web_browser_engine_playwright.py new file mode 100644 index 000000000..afca35d52 --- /dev/null +++ b/tests/metagpt/tools/test_web_browser_engine_playwright.py @@ -0,0 +1,35 @@ +import pytest +from metagpt.config import Config +from metagpt.tools import web_browser_engine_playwright + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "browser_type, use_proxy, kwagrs, url, urls", + [ + ("chromium", {"proxy": True}, {}, "https://fuzhi.ai", ("https://fuzhi.ai",)), + ("firefox", {}, {"ignore_https_errors": True}, "https://fuzhi.ai", ("https://fuzhi.ai",)), + ("webkit", {}, {"ignore_https_errors": True}, "https://fuzhi.ai", ("https://fuzhi.ai",)), + ], + ids=["chromium-normal", "firefox-normal", "webkit-normal"], +) +async def test_scrape_web_page(browser_type, use_proxy, kwagrs, url, urls, proxy, capfd): + try: + config = Config() + global_proxy = config.global_proxy + if use_proxy: + config.global_proxy = proxy + browser = web_browser_engine_playwright.PlaywrightWrapper(browser_type, **kwagrs) + result = await browser.run(url) + assert isinstance(result, str) + assert "Deepwisdom" in result + + if urls: + results = await browser.run(url, *urls) + assert isinstance(results, list) + assert len(results) == len(urls) + 1 + assert all(("Deepwisdom" in i) for i in results) + if use_proxy: + assert "Proxy:" in capfd.readouterr().out + finally: + 
config.global_proxy = global_proxy diff --git a/tests/metagpt/tools/test_web_browser_engine_selenium.py b/tests/metagpt/tools/test_web_browser_engine_selenium.py new file mode 100644 index 000000000..752938766 --- /dev/null +++ b/tests/metagpt/tools/test_web_browser_engine_selenium.py @@ -0,0 +1,35 @@ +import pytest +from metagpt.config import Config +from metagpt.tools import web_browser_engine_selenium + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "browser_type, use_proxy, url, urls", + [ + ("chrome", True, "https://fuzhi.ai", ("https://fuzhi.ai",)), + ("firefox", False, "https://fuzhi.ai", ("https://fuzhi.ai",)), + ("edge", False, "https://fuzhi.ai", ("https://fuzhi.ai",)), + ], + ids=["chrome-normal", "firefox-normal", "edge-normal"], +) +async def test_scrape_web_page(browser_type, use_proxy, url, urls, proxy, capfd): + try: + config = Config() + global_proxy = config.global_proxy + if use_proxy: + Config().global_proxy = proxy + browser = web_browser_engine_selenium.SeleniumWrapper(browser_type) + result = await browser.run(url) + assert isinstance(result, str) + assert "Deepwisdom" in result + + if urls: + results = await browser.run(url, *urls) + assert isinstance(results, list) + assert len(results) == len(urls) + 1 + assert all(("Deepwisdom" in i) for i in results) + if use_proxy: + assert "Proxy:" in capfd.readouterr().out + finally: + config.global_proxy = global_proxy From 6ac39c2afab9d4df9c07fe28924a1c999ff4e168 Mon Sep 17 00:00:00 2001 From: shenchucheng Date: Mon, 24 Jul 2023 00:22:38 +0800 Subject: [PATCH 3/9] update deps for web browser engine --- docs/ROADMAP.md | 4 ++-- requirements.txt | 3 +++ setup.py | 14 +++++++++++--- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/docs/ROADMAP.md b/docs/ROADMAP.md index 5c7557745..005a59ab2 100644 --- a/docs/ROADMAP.md +++ b/docs/ROADMAP.md @@ -58,8 +58,8 @@ ### Tasks 5. Plugins: Compatibility with plugin system 6. Tools 1. ~~Support SERPER api~~ - 2. Support Selenium apis - 3. 
Support Playwright apis + 2. ~~Support Selenium apis~~ + 3. ~~Support Playwright apis~~ 7. Roles 1. Perfect the action pool/skill pool for each role 2. Red Book blogger diff --git a/requirements.txt b/requirements.txt index bb53aedfe..b2eaaaf4c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -29,3 +29,6 @@ tenacity==8.2.2 tiktoken==0.3.3 tqdm==4.64.0 #unstructured[local-inference] +# playwright +# selenium>4 +# webdriver_manager<3.9 diff --git a/setup.py b/setup.py index 790c90650..6b3c2fb01 100644 --- a/setup.py +++ b/setup.py @@ -10,12 +10,12 @@ import subprocess class InstallMermaidCLI(Command): """A custom command to run `npm install -g @mermaid-js/mermaid-cli` via a subprocess.""" - description = 'install mermaid-cli' + description = "install mermaid-cli" user_options = [] def run(self): try: - subprocess.check_call(['npm', 'install', '-g', '@mermaid-js/mermaid-cli']) + subprocess.check_call(["npm", "install", "-g", "@mermaid-js/mermaid-cli"]) except subprocess.CalledProcessError as e: print(f"Error occurred: {e.output}") @@ -42,7 +42,15 @@ setup( packages=find_packages(exclude=["contrib", "docs", "examples"]), python_requires=">=3.9", install_requires=requirements, + extras_require={ + "web_browser": [ + "playwright>=1.26", + "beautifulsoup4", + ], + "playwright": ["playwright"], + "selenium": ["selenium>4", "webdriver_manager<3.9"], + }, cmdclass={ - 'install_mermaid': InstallMermaidCLI, + "install_mermaid": InstallMermaidCLI, }, ) From fac1cb0f595a5224a2605edc8603ff1492e072b9 Mon Sep 17 00:00:00 2001 From: shenchucheng Date: Tue, 25 Jul 2023 00:15:16 +0800 Subject: [PATCH 4/9] use `CONFIG` instead of `Config()` --- metagpt/tools/__init__.py | 6 +++--- metagpt/tools/web_browser_engine.py | 7 +++---- metagpt/tools/web_browser_engine_playwright.py | 14 ++++++-------- metagpt/tools/web_browser_engine_selenium.py | 10 ++++------ .../tools/test_web_browser_engine_playwright.py | 9 ++++----- .../tools/test_web_browser_engine_selenium.py | 9 ++++----- 6 
files changed, 24 insertions(+), 31 deletions(-) diff --git a/metagpt/tools/__init__.py b/metagpt/tools/__init__.py index 6f418baf4..f9b7abc52 100644 --- a/metagpt/tools/__init__.py +++ b/metagpt/tools/__init__.py @@ -18,6 +18,6 @@ class SearchEngineType(Enum): class WebBrowserEngineType(Enum): - PLAYWRIGHT = auto() - SELENIUM = auto() - CUSTOM_ENGINE = auto() + PLAYWRIGHT = "playwright" + SELENIUM = "selenium" + CUSTOM = "custom" diff --git a/metagpt/tools/web_browser_engine.py b/metagpt/tools/web_browser_engine.py index f38b4bd8f..629ada195 100644 --- a/metagpt/tools/web_browser_engine.py +++ b/metagpt/tools/web_browser_engine.py @@ -6,7 +6,7 @@ import importlib from typing import Any, Callable, Coroutine, overload -from metagpt.config import Config +from metagpt.config import CONFIG from metagpt.tools import WebBrowserEngineType from bs4 import BeautifulSoup @@ -17,8 +17,7 @@ class WebBrowserEngine: engine: WebBrowserEngineType | None = None, run_func: Callable[..., Coroutine[Any, Any, str | list[str]]] | None = None, ): - self.config = Config() - engine = engine or self.config.web_browser_engine + engine = engine or CONFIG.web_browser_engine if engine == WebBrowserEngineType.PLAYWRIGHT: web_browser_engine = importlib.import_module("metagpt.tools.web_browser_engine_playwright") @@ -26,7 +25,7 @@ class WebBrowserEngine: elif engine == WebBrowserEngineType.SELENIUM: web_browser_engine = importlib.import_module("metagpt.tools.web_browser_engine_selenium") run_func = web_browser_engine.SeleniumWrapper().run - elif engine == WebBrowserEngineType.CUSTOM_ENGINE: + elif engine == WebBrowserEngineType.CUSTOM: run_func = run_func else: raise NotImplementedError diff --git a/metagpt/tools/web_browser_engine_playwright.py b/metagpt/tools/web_browser_engine_playwright.py index 6546d7a29..5fef7bd1c 100644 --- a/metagpt/tools/web_browser_engine_playwright.py +++ b/metagpt/tools/web_browser_engine_playwright.py @@ -6,7 +6,7 @@ from pathlib import Path import sys from typing 
import Literal from playwright.async_api import async_playwright -from metagpt.config import Config +from metagpt.config import CONFIG from metagpt.logs import logger @@ -24,16 +24,14 @@ class PlaywrightWrapper: launch_kwargs: dict | None = None, **kwargs, ) -> None: - config = Config() - self.config = config if browser_type is None: - browser_type = config.playwright_browser_type + browser_type = CONFIG.playwright_browser_type self.browser_type = browser_type launch_kwargs = launch_kwargs or {} - if config.global_proxy and "proxy" not in launch_kwargs: + if CONFIG.global_proxy and "proxy" not in launch_kwargs: args = launch_kwargs.get("args", []) if not any(str.startswith(i, "--proxy-server=") for i in args): - launch_kwargs["proxy"] = {"server": config.global_proxy} + launch_kwargs["proxy"] = {"server": CONFIG.global_proxy} self.launch_kwargs = launch_kwargs context_kwargs = {} if "ignore_https_errors" in kwargs: @@ -70,8 +68,8 @@ class PlaywrightWrapper: executable_path = Path(browser_type.executable_path) if not executable_path.exists() and "executable_path" not in self.launch_kwargs: kwargs = {} - if self.config.global_proxy: - kwargs["env"] = {"ALL_PROXY": self.config.global_proxy} + if CONFIG.global_proxy: + kwargs["env"] = {"ALL_PROXY": CONFIG.global_proxy} await _install_browsers(self.browser_type, **kwargs) if not executable_path.exists(): parts = executable_path.parts diff --git a/metagpt/tools/web_browser_engine_selenium.py b/metagpt/tools/web_browser_engine_selenium.py index d5d01daf2..f36f2dfbc 100644 --- a/metagpt/tools/web_browser_engine_selenium.py +++ b/metagpt/tools/web_browser_engine_selenium.py @@ -5,7 +5,7 @@ import asyncio from copy import deepcopy import importlib from typing import Literal -from metagpt.config import Config +from metagpt.config import CONFIG import asyncio from selenium.webdriver.common.by import By @@ -29,14 +29,12 @@ class SeleniumWrapper: loop: asyncio.AbstractEventLoop | None = None, executor: futures.Executor | None = 
None, ) -> None: - config = Config() - self.config = config if browser_type is None: - browser_type = config.selenium_browser_type + browser_type = CONFIG.selenium_browser_type self.browser_type = browser_type launch_kwargs = launch_kwargs or {} - if config.global_proxy and "proxy-server" not in launch_kwargs: - launch_kwargs["proxy-server"] = config.global_proxy + if CONFIG.global_proxy and "proxy-server" not in launch_kwargs: + launch_kwargs["proxy-server"] = CONFIG.global_proxy self.executable_path = launch_kwargs.pop("executable_path", None) self.launch_args = [f"--{k}={v}" for k, v in launch_kwargs.items()] diff --git a/tests/metagpt/tools/test_web_browser_engine_playwright.py b/tests/metagpt/tools/test_web_browser_engine_playwright.py index afca35d52..908f92112 100644 --- a/tests/metagpt/tools/test_web_browser_engine_playwright.py +++ b/tests/metagpt/tools/test_web_browser_engine_playwright.py @@ -1,5 +1,5 @@ import pytest -from metagpt.config import Config +from metagpt.config import CONFIG from metagpt.tools import web_browser_engine_playwright @@ -15,10 +15,9 @@ from metagpt.tools import web_browser_engine_playwright ) async def test_scrape_web_page(browser_type, use_proxy, kwagrs, url, urls, proxy, capfd): try: - config = Config() - global_proxy = config.global_proxy + global_proxy = CONFIG.global_proxy if use_proxy: - config.global_proxy = proxy + CONFIG.global_proxy = proxy browser = web_browser_engine_playwright.PlaywrightWrapper(browser_type, **kwagrs) result = await browser.run(url) assert isinstance(result, str) @@ -32,4 +31,4 @@ async def test_scrape_web_page(browser_type, use_proxy, kwagrs, url, urls, proxy if use_proxy: assert "Proxy:" in capfd.readouterr().out finally: - config.global_proxy = global_proxy + CONFIG.global_proxy = global_proxy diff --git a/tests/metagpt/tools/test_web_browser_engine_selenium.py b/tests/metagpt/tools/test_web_browser_engine_selenium.py index 752938766..5ea1e3083 100644 --- 
a/tests/metagpt/tools/test_web_browser_engine_selenium.py +++ b/tests/metagpt/tools/test_web_browser_engine_selenium.py @@ -1,5 +1,5 @@ import pytest -from metagpt.config import Config +from metagpt.config import CONFIG from metagpt.tools import web_browser_engine_selenium @@ -15,10 +15,9 @@ from metagpt.tools import web_browser_engine_selenium ) async def test_scrape_web_page(browser_type, use_proxy, url, urls, proxy, capfd): try: - config = Config() - global_proxy = config.global_proxy + global_proxy = CONFIG.global_proxy if use_proxy: - Config().global_proxy = proxy + CONFIG.global_proxy = proxy browser = web_browser_engine_selenium.SeleniumWrapper(browser_type) result = await browser.run(url) assert isinstance(result, str) @@ -32,4 +31,4 @@ async def test_scrape_web_page(browser_type, use_proxy, url, urls, proxy, capfd) if use_proxy: assert "Proxy:" in capfd.readouterr().out finally: - config.global_proxy = global_proxy + CONFIG.global_proxy = global_proxy From c84b337ef7318c7537b8a7a5b356ac158b227906 Mon Sep 17 00:00:00 2001 From: shenchucheng Date: Tue, 25 Jul 2023 00:23:37 +0800 Subject: [PATCH 5/9] add an example configuration for web access --- config/config.yaml | 12 ++++++++++++ metagpt/config.py | 4 ++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index fbd9aa81e..1063babbe 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -5,6 +5,7 @@ #OPENAI_API_KEY: "YOUR_API_KEY" #OPENAI_API_BASE: "YOUR_API_BASE" +#OPENAI_PROXY: "http://127.0.0.1:8118" OPENAI_API_MODEL: "gpt-4" MAX_TOKENS: 1500 RPM: 10 @@ -31,6 +32,17 @@ RPM: 10 ## Visit https://serper.dev/ to get key. 
#SERPER_API_KEY: "YOUR_API_KEY" +#### for web access + +## Supported values: playwright/selenium +#WEB_BROWSER_ENGINE: playwright + +## Supported values: chromium/firefox/webkit, visit https://playwright.dev/python/docs/api/class-browsertype +#PLAYWRIGHT_BROWSER_TYPE: chromium + +## Supported values: chrome/firefox/edge/ie, visit https://www.selenium.dev/documentation/webdriver/browsers/ +#SELENIUM_BROWSER_TYPE: chrome + #### for TTS #AZURE_TTS_SUBSCRIPTION_KEY: "YOUR_API_KEY" diff --git a/metagpt/config.py b/metagpt/config.py index 8d8725776..bc78d44ba 100644 --- a/metagpt/config.py +++ b/metagpt/config.py @@ -65,9 +65,9 @@ class Config(metaclass=Singleton): self.google_api_key = self._get("GOOGLE_API_KEY") self.google_cse_id = self._get("GOOGLE_CSE_ID") self.search_engine = self._get("SEARCH_ENGINE", SearchEngineType.SERPAPI_GOOGLE) - self.web_browser_engine = self._get("WEB_BROWSER_ENGINE", WebBrowserEngineType.PLAYWRIGHT) + self.web_browser_engine = WebBrowserEngineType(self._get("WEB_BROWSER_ENGINE", "playwright")) self.playwright_browser_type = self._get("PLAYWRIGHT_BROWSER_TYPE", "chromium") - self.selenium_browser_type = self._get("selenium_browser_type", "chrome") + self.selenium_browser_type = self._get("SELENIUM_BROWSER_TYPE", "chrome") self.max_budget = self._get("MAX_BUDGET", 10.0) self.total_cost = 0.0 From 0700cd2e62477227034017ea3fd8bc65f98446cb Mon Sep 17 00:00:00 2001 From: shenchucheng Date: Tue, 25 Jul 2023 00:32:02 +0800 Subject: [PATCH 6/9] add parse_func for WebBrowserEngine to support custom parsing --- metagpt/tools/web_browser_engine.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/metagpt/tools/web_browser_engine.py b/metagpt/tools/web_browser_engine.py index 629ada195..90449d8e1 100644 --- a/metagpt/tools/web_browser_engine.py +++ b/metagpt/tools/web_browser_engine.py @@ -16,6 +16,7 @@ class WebBrowserEngine: self, engine: WebBrowserEngineType | None = None, run_func: Callable[..., Coroutine[Any, Any, str |
list[str]]] | None = None, + parse_func: Callable[[str], str] | None = None, ): engine = engine or CONFIG.web_browser_engine @@ -29,6 +30,7 @@ class WebBrowserEngine: run_func = run_func else: raise NotImplementedError + self.parse_func = parse_func or get_page_content self.run_func = run_func self.engine = engine @@ -43,8 +45,8 @@ class WebBrowserEngine: async def run(self, url: str, *urls: str) -> str | list[str]: page = await self.run_func(url, *urls) if isinstance(page, str): - return get_page_content(page) - return [get_page_content(i) for i in page] + return self.parse_func(page) + return [self.parse_func(i) for i in page] def get_page_content(page: str): From a6f13b693847298123f3b84deee571b8b17d880f Mon Sep 17 00:00:00 2001 From: shenchucheng Date: Tue, 25 Jul 2023 00:48:32 +0800 Subject: [PATCH 7/9] remove inappropriate variable names used in the definition of the class WebBrowserEngine. --- metagpt/tools/web_browser_engine.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/metagpt/tools/web_browser_engine.py b/metagpt/tools/web_browser_engine.py index 90449d8e1..d1f83934f 100644 --- a/metagpt/tools/web_browser_engine.py +++ b/metagpt/tools/web_browser_engine.py @@ -21,11 +21,11 @@ class WebBrowserEngine: engine = engine or CONFIG.web_browser_engine if engine == WebBrowserEngineType.PLAYWRIGHT: - web_browser_engine = importlib.import_module("metagpt.tools.web_browser_engine_playwright") - run_func = web_browser_engine.PlaywrightWrapper().run + module = "metagpt.tools.web_browser_engine_playwright" + run_func = importlib.import_module(module).PlaywrightWrapper().run elif engine == WebBrowserEngineType.SELENIUM: - web_browser_engine = importlib.import_module("metagpt.tools.web_browser_engine_selenium") - run_func = web_browser_engine.SeleniumWrapper().run + module = "metagpt.tools.web_browser_engine_selenium" + run_func = importlib.import_module(module).SeleniumWrapper().run elif engine == WebBrowserEngineType.CUSTOM: run_func = 
run_func else: From b6542768ee4ca5446ef87c7a3014001f8028e6d0 Mon Sep 17 00:00:00 2001 From: shenchucheng Date: Tue, 25 Jul 2023 01:31:58 +0800 Subject: [PATCH 8/9] update extras_require --- setup.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index 0b248c7b0..e65696901 100644 --- a/setup.py +++ b/setup.py @@ -43,12 +43,8 @@ setup( python_requires=">=3.9", install_requires=requirements, extras_require={ - "web_browser": [ - "playwright>=1.26", - "beautifulsoup4", - ], - "playwright": ["playwright"], - "selenium": ["selenium>4", "webdriver_manager<3.9"], + "playwright": ["playwright>=1.26", "beautifulsoup4"], + "selenium": ["selenium>4", "webdriver_manager<3.9", "beautifulsoup4"], }, cmdclass={ "install_mermaid": InstallMermaidCLI, From 8cb233ccffe55f29d3b3a4c38ad70b1854deb7c0 Mon Sep 17 00:00:00 2001 From: shenchucheng Date: Tue, 25 Jul 2023 01:48:09 +0800 Subject: [PATCH 9/9] update docs for PlaywrightWrapper/SeleniumWrapper --- metagpt/tools/web_browser_engine_playwright.py | 7 ++++--- metagpt/tools/web_browser_engine_selenium.py | 9 +++++++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/metagpt/tools/web_browser_engine_playwright.py b/metagpt/tools/web_browser_engine_playwright.py index 5fef7bd1c..ae8644cce 100644 --- a/metagpt/tools/web_browser_engine_playwright.py +++ b/metagpt/tools/web_browser_engine_playwright.py @@ -13,9 +13,10 @@ from metagpt.logs import logger class PlaywrightWrapper: """Wrapper around Playwright. - To use this module, you should have the ``playwright`` Python package installed and ensure - that the required browsers are also installed. You can download the necessary browser binaries - by running the command `playwright install` for the first time. + To use this module, you should have the `playwright` Python package installed and ensure that + the required browsers are also installed. 
You can install playwright by running the command + `pip install metagpt[playwright]` and download the necessary browser binaries by running the + command `playwright install` for the first time. """ def __init__( diff --git a/metagpt/tools/web_browser_engine_selenium.py b/metagpt/tools/web_browser_engine_selenium.py index f36f2dfbc..bd8a456ea 100644 --- a/metagpt/tools/web_browser_engine_selenium.py +++ b/metagpt/tools/web_browser_engine_selenium.py @@ -17,8 +17,13 @@ from concurrent import futures class SeleniumWrapper: """Wrapper around Selenium. - To use this module, you should have the ``selenium`` Python package installed and ensure - that the required browsers are also installed. + To use this module, you should check the following: + + 1. Run the following command: pip install metagpt[selenium]. + 2. Make sure you have a compatible web browser installed and the appropriate WebDriver set up + for that browser before running. For example, if you have Mozilla Firefox installed on your + computer, you can set the configuration SELENIUM_BROWSER_TYPE to firefox. After that, you + can scrape web pages using the Selenium WebBrowserEngine. """ def __init__(