integrate web scraping tool

2026-07-23 17:01:08 +02:00 · 2024-01-18 23:26:34 +08:00 · 2024-01-18 23:26:34 +08:00 · 88c4c8c90d
commit 88c4c8c90d
parent c32dcca293
8 changed files with 18 additions and 11 deletions
--- a/metagpt/tools/__init__.py
+++ b/metagpt/tools/__init__.py
@@ -11,7 +11,7 @@ from metagpt.tools import tool_types  # this registers all tool types
 from metagpt.tools import libs  # this registers all tools
 from metagpt.tools.tool_registry import TOOL_REGISTRY

-_, _, _ = tool_types, libs, TOOL_REGISTRY  # Avoid pre-commit error
+_ = tool_types, libs, TOOL_REGISTRY  # Avoid pre-commit error


 class SearchEngineType(Enum):
--- a/metagpt/tools/functions/libs/scrape_web/__init__.py
+++ b/metagpt/tools/functions/libs/scrape_web/__init__.py
@@ -1 +0,0 @@
-from metagpt.tools.functions.libs.scrape_web.scrape_web import scrape_web
--- a/metagpt/tools/libs/__init__.py
+++ b/metagpt/tools/libs/__init__.py
@@ -9,6 +9,7 @@ from metagpt.tools.libs import (
    feature_engineering,
    sd_engine,
    gpt_v_generator,
+    web_scrapping,
 )

-_, _, _, _ = data_preprocess, feature_engineering, sd_engine, gpt_v_generator  # Avoid pre-commit error
+_ = data_preprocess, feature_engineering, sd_engine, gpt_v_generator, web_scrapping  # Avoid pre-commit error
--- a/metagpt/tools/functions/libs/scrape_web/scrape_web.py
+++ b/metagpt/tools/functions/libs/scrape_web/scrape_web.py
@@ -1,9 +1,10 @@
-import asyncio
-
+from metagpt.tools.tool_data_type import ToolTypeEnum
+from metagpt.tools.tool_registry import register_tool
 from metagpt.tools.web_browser_engine_playwright import PlaywrightWrapper


-async def scrape_web(url, *urls):
+@register_tool(tool_type=ToolTypeEnum.WEBSCRAPING.value)
+async def scrape_web_playwright(url, *urls):
    """
    Scrape and save the HTML structure and inner text content of a web page using Playwright.

@@ -19,5 +20,3 @@ async def scrape_web(url, *urls):

    # Return the inner text content of the web page
    return {"inner_text": web.inner_text, "html": web.html}
-
-# 需要改三个地方: yaml, 对应路径下init, MetaGPT/metagpt/prompts/ml_engineer.py中ML_MODULE_MAP
--- a/metagpt/tools/schemas/web_scrapping/scrape_web_playwright.yml
+++ b/metagpt/tools/schemas/web_scrapping/scrape_web_playwright.yml
@@ -1,4 +1,4 @@
-scrape_web:
+scrape_web_playwright:
  type: async funciton
  description: "Scrape and save the HTML structure and inner text content of a web page using Playwright."
  parameters:
--- a/metagpt/tools/tool_data_type.py
+++ b/metagpt/tools/tool_data_type.py
@@ -11,6 +11,7 @@ class ToolTypeEnum(Enum):
    MODEL_EVALUATE = "model_evaluate"
    STABLE_DIFFUSION = "stable_diffusion"
    IMAGE2WEBPAGE = "image2webpage"
+    WEBSCRAPING = "web_scraping"
    OTHER = "other"

    def __missing__(self, key):
--- a/metagpt/tools/tool_types.py
+++ b/metagpt/tools/tool_types.py
@@ -12,7 +12,7 @@ from metagpt.tools.tool_registry import register_tool_type
@register_tool_type
 class EDA(ToolType):
    name: str = ToolTypeEnum.EDA.value
-    desc: str = "Useful for performing exploratory data analysis"
+    desc: str = "For performing exploratory data analysis"


@register_tool_type
@@ -56,6 +56,12 @@ class Image2Webpage(ToolType):
    usage_prompt: str = IMAGE2WEBPAGE_PROMPT


+@register_tool_type
+class WebScraping(ToolType):
+    name: str = ToolTypeEnum.WEBSCRAPING.value
+    desc: str = "For scraping data from web pages."
+
+
@register_tool_type
 class Other(ToolType):
    name: str = ToolTypeEnum.OTHER.value
--- a/metagpt/tools/web_browser_engine_playwright.py
+++ b/metagpt/tools/web_browser_engine_playwright.py
@@ -12,7 +12,6 @@ from typing import Literal

 from playwright.async_api import async_playwright

-from metagpt.config import CONFIG
 from metagpt.logs import logger
 from metagpt.utils.parse_html import WebPage

@@ -32,6 +31,8 @@ class PlaywrightWrapper:
        launch_kwargs: dict | None = None,
        **kwargs,
    ) -> None:
+        from metagpt.config import CONFIG
+
        if browser_type is None:
            browser_type = CONFIG.playwright_browser_type
        self.browser_type = browser_type
				`@@ -1 +0,0 @@`
				`-from metagpt.tools.functions.libs.scrape_web.scrape_web import scrape_web`