add scrape_web.py

This commit is contained in:
刘棒棒 2024-01-15 18:10:57 +08:00
parent 00f7f93234
commit 75628caf4d
2 changed files with 27 additions and 0 deletions

View file

@ -0,0 +1 @@
from metagpt.tools.functions.libs.scrape_web.scrape_web import scrape_web

View file

@ -0,0 +1,26 @@
import asyncio
from metagpt.tools.web_browser_engine_playwright import PlaywrightWrapper
async def scrape_web(url, *urls):
    """
    Scrape the inner text and HTML structure of one or more web pages using Playwright.

    Args:
        url (str): The main URL to scrape.
        *urls (str): Additional URLs to scrape in the same browser run.

    Returns:
        dict | list[dict]: For a single page, a dict with the keys 'inner_text'
        and 'html'. When the wrapper hands back a sequence of pages (expected
        when additional URLs are given), a list of such dicts, one per page,
        in the order the wrapper returned them.

    Raises:
        Any exceptions that may occur during the Playwright operation.
    """

    def _page_to_dict(page):
        # Keep only the two fields callers of this tool rely on.
        return {"inner_text": page.inner_text, "html": page.html}

    # Create a PlaywrightWrapper instance for the Chromium browser and fetch the page(s).
    result = await PlaywrightWrapper("chromium").run(url, *urls)

    # NOTE(review): run() is expected to return a list of pages when extra URLs
    # are passed (confirm against PlaywrightWrapper). Reading .inner_text on
    # that list would raise AttributeError, so each page is mapped separately.
    if isinstance(result, (list, tuple)):
        return [_page_to_dict(page) for page in result]

    # Single-URL case: same return shape as before.
    return _page_to_dict(result)
# NOTE: adding this tool requires changes in three places: the yaml file, the __init__ under the corresponding path, and ML_MODULE_MAP in MetaGPT/metagpt/prompts/ml_engineer.py