integrate web scraping tool

This commit is contained in:
yzlin 2024-01-18 23:26:34 +08:00
parent c32dcca293
commit 88c4c8c90d
8 changed files with 18 additions and 11 deletions

View file

@ -11,7 +11,7 @@ from metagpt.tools import tool_types # this registers all tool types
from metagpt.tools import libs # this registers all tools
from metagpt.tools.tool_registry import TOOL_REGISTRY
_, _, _ = tool_types, libs, TOOL_REGISTRY # Avoid pre-commit error
_ = tool_types, libs, TOOL_REGISTRY # Avoid pre-commit error
class SearchEngineType(Enum):

View file

@ -1 +0,0 @@
from metagpt.tools.functions.libs.scrape_web.scrape_web import scrape_web

View file

@ -9,6 +9,7 @@ from metagpt.tools.libs import (
feature_engineering,
sd_engine,
gpt_v_generator,
web_scrapping,
)
_, _, _, _ = data_preprocess, feature_engineering, sd_engine, gpt_v_generator # Avoid pre-commit error
_ = data_preprocess, feature_engineering, sd_engine, gpt_v_generator, web_scrapping # Avoid pre-commit error

View file

@ -1,9 +1,10 @@
import asyncio
from metagpt.tools.tool_data_type import ToolTypeEnum
from metagpt.tools.tool_registry import register_tool
from metagpt.tools.web_browser_engine_playwright import PlaywrightWrapper
async def scrape_web(url, *urls):
@register_tool(tool_type=ToolTypeEnum.WEBSCRAPING.value)
async def scrape_web_playwright(url, *urls):
"""
Scrape and save the HTML structure and inner text content of a web page using Playwright.
@ -19,5 +20,3 @@ async def scrape_web(url, *urls):
# Return the inner text content of the web page
return {"inner_text": web.inner_text, "html": web.html}
# Three places need to be updated: the yaml schema, the __init__ under the corresponding path, and ML_MODULE_MAP in MetaGPT/metagpt/prompts/ml_engineer.py

View file

@ -1,4 +1,4 @@
scrape_web:
scrape_web_playwright:
type: async function
description: "Scrape and save the HTML structure and inner text content of a web page using Playwright."
parameters:

View file

@ -11,6 +11,7 @@ class ToolTypeEnum(Enum):
MODEL_EVALUATE = "model_evaluate"
STABLE_DIFFUSION = "stable_diffusion"
IMAGE2WEBPAGE = "image2webpage"
WEBSCRAPING = "web_scraping"
OTHER = "other"
def __missing__(self, key):

View file

@ -12,7 +12,7 @@ from metagpt.tools.tool_registry import register_tool_type
@register_tool_type
class EDA(ToolType):
name: str = ToolTypeEnum.EDA.value
desc: str = "Useful for performing exploratory data analysis"
desc: str = "For performing exploratory data analysis"
@register_tool_type
@ -56,6 +56,12 @@ class Image2Webpage(ToolType):
usage_prompt: str = IMAGE2WEBPAGE_PROMPT
@register_tool_type
class WebScraping(ToolType):
name: str = ToolTypeEnum.WEBSCRAPING.value
desc: str = "For scraping data from web pages."
@register_tool_type
class Other(ToolType):
name: str = ToolTypeEnum.OTHER.value

View file

@ -12,7 +12,6 @@ from typing import Literal
from playwright.async_api import async_playwright
from metagpt.config import CONFIG
from metagpt.logs import logger
from metagpt.utils.parse_html import WebPage
@ -32,6 +31,8 @@ class PlaywrightWrapper:
launch_kwargs: dict | None = None,
**kwargs,
) -> None:
from metagpt.config import CONFIG
if browser_type is None:
browser_type = CONFIG.playwright_browser_type
self.browser_type = browser_type