mirror of
https://github.com/FoundationAgents/MetaGPT.git
synced 2026-05-02 04:12:45 +02:00
integrate web scraping tool
This commit is contained in:
parent
c32dcca293
commit
88c4c8c90d
8 changed files with 18 additions and 11 deletions
|
|
@ -11,7 +11,7 @@ from metagpt.tools import tool_types # this registers all tool types
|
|||
from metagpt.tools import libs # this registers all tools
|
||||
from metagpt.tools.tool_registry import TOOL_REGISTRY
|
||||
|
||||
_, _, _ = tool_types, libs, TOOL_REGISTRY # Avoid pre-commit error
|
||||
_ = tool_types, libs, TOOL_REGISTRY # Avoid pre-commit error
|
||||
|
||||
|
||||
class SearchEngineType(Enum):
|
||||
|
|
|
|||
|
|
@ -1 +0,0 @@
|
|||
from metagpt.tools.functions.libs.scrape_web.scrape_web import scrape_web
|
||||
|
|
@ -9,6 +9,7 @@ from metagpt.tools.libs import (
|
|||
feature_engineering,
|
||||
sd_engine,
|
||||
gpt_v_generator,
|
||||
web_scrapping,
|
||||
)
|
||||
|
||||
_, _, _, _ = data_preprocess, feature_engineering, sd_engine, gpt_v_generator # Avoid pre-commit error
|
||||
_ = data_preprocess, feature_engineering, sd_engine, gpt_v_generator, web_scrapping # Avoid pre-commit error
|
||||
|
|
|
|||
|
|
@ -1,9 +1,10 @@
|
|||
import asyncio
|
||||
|
||||
from metagpt.tools.tool_data_type import ToolTypeEnum
|
||||
from metagpt.tools.tool_registry import register_tool
|
||||
from metagpt.tools.web_browser_engine_playwright import PlaywrightWrapper
|
||||
|
||||
|
||||
async def scrape_web(url, *urls):
|
||||
@register_tool(tool_type=ToolTypeEnum.WEBSCRAPING.value)
|
||||
async def scrape_web_playwright(url, *urls):
|
||||
"""
|
||||
Scrape and save the HTML structure and inner text content of a web page using Playwright.
|
||||
|
||||
|
|
@ -19,5 +20,3 @@ async def scrape_web(url, *urls):
|
|||
|
||||
# Return the inner text content of the web page
|
||||
return {"inner_text": web.inner_text, "html": web.html}
|
||||
|
||||
# Three places need to be changed: the yaml, the __init__ under the corresponding path, and ML_MODULE_MAP in MetaGPT/metagpt/prompts/ml_engineer.py
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
scrape_web:
|
||||
scrape_web_playwright:
|
||||
type: async funciton
|
||||
description: "Scrape and save the HTML structure and inner text content of a web page using Playwright."
|
||||
parameters:
|
||||
|
|
@ -11,6 +11,7 @@ class ToolTypeEnum(Enum):
|
|||
MODEL_EVALUATE = "model_evaluate"
|
||||
STABLE_DIFFUSION = "stable_diffusion"
|
||||
IMAGE2WEBPAGE = "image2webpage"
|
||||
WEBSCRAPING = "web_scraping"
|
||||
OTHER = "other"
|
||||
|
||||
def __missing__(self, key):
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ from metagpt.tools.tool_registry import register_tool_type
|
|||
@register_tool_type
|
||||
class EDA(ToolType):
|
||||
name: str = ToolTypeEnum.EDA.value
|
||||
desc: str = "Useful for performing exploratory data analysis"
|
||||
desc: str = "For performing exploratory data analysis"
|
||||
|
||||
|
||||
@register_tool_type
|
||||
|
|
@ -56,6 +56,12 @@ class Image2Webpage(ToolType):
|
|||
usage_prompt: str = IMAGE2WEBPAGE_PROMPT
|
||||
|
||||
|
||||
@register_tool_type
|
||||
class WebScraping(ToolType):
|
||||
name: str = ToolTypeEnum.WEBSCRAPING.value
|
||||
desc: str = "For scraping data from web pages."
|
||||
|
||||
|
||||
@register_tool_type
|
||||
class Other(ToolType):
|
||||
name: str = ToolTypeEnum.OTHER.value
|
||||
|
|
|
|||
|
|
@ -12,7 +12,6 @@ from typing import Literal
|
|||
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
from metagpt.config import CONFIG
|
||||
from metagpt.logs import logger
|
||||
from metagpt.utils.parse_html import WebPage
|
||||
|
||||
|
|
@ -32,6 +31,8 @@ class PlaywrightWrapper:
|
|||
launch_kwargs: dict | None = None,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
from metagpt.config import CONFIG
|
||||
|
||||
if browser_type is None:
|
||||
browser_type = CONFIG.playwright_browser_type
|
||||
self.browser_type = browser_type
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue