# Provenance (unfolded from a single-line git format-patch):
#   commit : 75628caf4d68b7519c63a84c0203326ea05ace5c
#   date   : Mon, 15 Jan 2024 18:10:57 +0800
#   subject: [PATCH] add scrape_web.py
#   files  : metagpt/tools/functions/libs/scrape_web/__init__.py   (new, 1 line)
#            metagpt/tools/functions/libs/scrape_web/scrape_web.py (new, 26 lines)
#
# The companion __init__.py added by the same patch contains only:
#   from metagpt.tools.functions.libs.scrape_web.scrape_web import scrape_web
#
# Everything below is the content of scrape_web.py.
import asyncio

from metagpt.tools.web_browser_engine_playwright import PlaywrightWrapper


def _page_to_dict(page):
    """Project a scraped page object onto the plain dict returned to callers."""
    return {"inner_text": page.inner_text, "html": page.html}


async def scrape_web(url, *urls):
    """
    Scrape the HTML structure and inner text content of web pages using Playwright.

    Nothing is written to disk; the scraped content is only returned.

    Args:
        url (str): The main URL to scrape.
        *urls (str): Additional URLs to scrape in the same wrapper run.

    Returns:
        (dict): When the wrapper yields a single page, its content with keys
            'inner_text' and 'html'.
        (list of dict): When the wrapper yields a sequence of pages, one such
            dict per page, in the order the wrapper returned them.
            NOTE(review): presumably this is the case whenever extra ``urls``
            are passed -- confirm against ``PlaywrightWrapper.run``.

    Raises:
        Any exception raised by the underlying Playwright operation is
        propagated unchanged.
    """
    # Drive a Chromium browser through the project's Playwright wrapper.
    scraped = await PlaywrightWrapper("chromium").run(url, *urls)

    # A sequence result has no `.inner_text` / `.html` of its own, so convert
    # each page individually instead of failing with AttributeError.
    if isinstance(scraped, (list, tuple)):
        return [_page_to_dict(page) for page in scraped]

    # Single-page result: same shape as before.
    return _page_to_dict(scraped)

# Three places need to be changed (translated from the original note): the yaml,
# the __init__ under the corresponding path, and ML_MODULE_MAP in
# MetaGPT/metagpt/prompts/ml_engineer.py.