update webscraping tool

2026-07-23 17:01:08 +02:00 · 2024-02-05 22:15:47 +08:00 · 2024-02-05 22:15:47 +08:00 · 9b72370cbe
commit 9b72370cbe
parent 23c27627ce
2 changed files with 5 additions and 6 deletions
--- a/examples/crawl_webpage.py
+++ b/examples/crawl_webpage.py
@@ -10,7 +10,7 @@ from metagpt.roles.ci.code_interpreter import CodeInterpreter

 async def main():
    prompt = """Get data from `paperlist` table in https://papercopilot.com/statistics/iclr-statistics/iclr-2024-statistics/,
-    and save it to a csv file. paper title must include `multiagent` or `large language model`. *notice: print key data*"""
+    and save it to a csv file. paper title must include `multiagent` or `large language model`. *notice: print key variables*"""
    ci = CodeInterpreter(goal=prompt, use_tools=True)

    await ci.run(prompt)
--- a/metagpt/tools/libs/web_scraping.py
+++ b/metagpt/tools/libs/web_scraping.py
@@ -4,19 +4,18 @@ from metagpt.tools.web_browser_engine_playwright import PlaywrightWrapper


@register_tool(tool_type=ToolType.WEBSCRAPING.type_name)
-async def scrape_web_playwright(url, *urls):
+async def scrape_web_playwright(url):
    """
-    Scrape and save the HTML structure and inner text content of a web page using Playwright.
+    Asynchronously scrape and save the HTML structure and inner text content of a web page using Playwright.

    Args:
        url (str): The main URL to fetch inner text from.
-        *urls (str): Additional URLs to fetch inner text from.

    Returns:
-        (dict): The inner text content and html structure of the web page, key are : 'inner_text', 'html'.
+        dict: The inner text content and html structure of the web page, keys are 'inner_text', 'html'.
    """
    # Create a PlaywrightWrapper instance for the Chromium browser
-    web = await PlaywrightWrapper().run(url, *urls)
+    web = await PlaywrightWrapper().run(url)

    # Return the inner text content of the web page
    return {"inner_text": web.inner_text.strip(), "html": web.html.strip()}