update webscraping tool

This commit is contained in:
yzlin 2024-02-05 22:15:47 +08:00
parent 23c27627ce
commit 9b72370cbe
2 changed files with 5 additions and 6 deletions

View file

@ -10,7 +10,7 @@ from metagpt.roles.ci.code_interpreter import CodeInterpreter
async def main():
prompt = """Get data from `paperlist` table in https://papercopilot.com/statistics/iclr-statistics/iclr-2024-statistics/,
and save it to a csv file. paper title must include `multiagent` or `large language model`. *notice: print key data*"""
and save it to a csv file. paper title must include `multiagent` or `large language model`. *notice: print key variables*"""
ci = CodeInterpreter(goal=prompt, use_tools=True)
await ci.run(prompt)

View file

@ -4,19 +4,18 @@ from metagpt.tools.web_browser_engine_playwright import PlaywrightWrapper
@register_tool(tool_type=ToolType.WEBSCRAPING.type_name)
async def scrape_web_playwright(url, *urls):
async def scrape_web_playwright(url):
"""
Scrape and save the HTML structure and inner text content of a web page using Playwright.
Asynchronously scrape and save the HTML structure and inner text content of a web page using Playwright.
Args:
url (str): The main URL to fetch inner text from.
*urls (str): Additional URLs to fetch inner text from.
Returns:
(dict): The inner text content and html structure of the web page, key are : 'inner_text', 'html'.
dict: The inner text content and html structure of the web page, keys are 'inner_text', 'html'.
"""
# Create a PlaywrightWrapper instance for the Chromium browser
web = await PlaywrightWrapper().run(url, *urls)
web = await PlaywrightWrapper().run(url)
# Return both the inner text content and the HTML structure of the web page
return {"inner_text": web.inner_text.strip(), "html": web.html.strip()}