Mirror of https://github.com/FoundationAgents/MetaGPT.git (synced 2026-05-15 11:02:36 +02:00)
refine web_scraping tool
This commit is contained in:
parent 4c4d9547ff
commit ae861d99cd

2 changed files with 8 additions and 6 deletions
@@ -8,13 +8,15 @@ from metagpt.utils.parse_html import simplify_html
 
 @register_tool(tags=["web scraping"])
-async def view_page_element_to_scrape(url: str, requirement: str, keep_links: bool = False) -> None:
-    """view the HTML content of current page to understand the structure. When executed, the content will be printed out
+async def view_page_element_to_scrape(url: str, requirement: str, keep_links: bool = False) -> str:
+    """view the HTML content of current page to understand the structure.
 
     Args:
         url (str): The URL of the web page to scrape.
         requirement (str): Providing a clear and detailed requirement helps in focusing the inspection on the desired elements.
         keep_links (bool): Whether to keep the hyperlinks in the HTML content. Set to True if links are required
+
+    Returns:
+        str: The HTML content of the page.
     """
     async with Browser() as browser:
         await browser.goto(url)
@@ -36,7 +38,7 @@ async def view_page_element_to_scrape(url: str, requirement: str, keep_links: bo
     html = "\n".join(i.text for i in nodes)
 
     mem_fs.rm_file(filename)
-    print(html)
+    return html
 
 
 # async def get_elements_outerhtml(self, element_ids: list[int]):
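The first changed file turns view_page_element_to_scrape from a print-only tool into one that also returns the slimmed HTML, and documents the return value in the docstring, so callers can use the string programmatically instead of scraping stdout. A minimal usage sketch; the import path and the URL/requirement values are assumptions, not part of this diff:

import asyncio

from metagpt.tools.libs.web_scraping import view_page_element_to_scrape  # assumed module path


async def main() -> None:
    # After this commit the tool returns the slimmed HTML as well as printing it,
    # so the result can be handed to downstream code (e.g. an LLM writing a parser).
    html = await view_page_element_to_scrape(
        url="https://example.com",                      # placeholder URL
        requirement="find the product name and price",  # placeholder requirement
        keep_links=True,                                 # keep href/title attributes
    )
    assert isinstance(html, str)


if __name__ == "__main__":
    asyncio.run(main())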
@@ -41,13 +41,13 @@ class WebPage(BaseModel):
 
     def get_slim_soup(self, keep_links: bool = False):
         soup = _get_soup(self.html)
-        keep_attrs = ["class"]
+        keep_attrs = ["class", "id"]
         if keep_links:
-            keep_attrs.append("href")
+            keep_attrs.extend(["href", "title"])
 
         for i in soup.find_all(True):
             for name in list(i.attrs):
-                if i[name] and name not in keep_attrs:
+                if i[name] and name not in keep_attrs and not name.startswith("data-"):
                     del i[name]
 
         for i in soup.find_all(["svg", "img", "video", "audio"]):
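The second changed file (presumably metagpt/utils/parse_html.py, given the WebPage class and the simplify_html import above) widens the attribute whitelist in get_slim_soup: id is now always kept, title is kept alongside href when keep_links=True, and data-* attributes are no longer stripped. A standalone sketch of the same filtering rule using BeautifulSoup directly, since the internal WebPage/_get_soup helpers are not shown in this diff:

from bs4 import BeautifulSoup

HTML = (
    '<div class="card" id="item-1" style="color:red" data-sku="42">'
    '<a href="/p/42" title="Product" onclick="track()">Buy</a></div>'
)


def slim(html: str, keep_links: bool = False) -> str:
    soup = BeautifulSoup(html, "html.parser")
    keep_attrs = ["class", "id"]              # "id" is newly whitelisted by this commit
    if keep_links:
        keep_attrs.extend(["href", "title"])  # "title" is newly kept as well

    for tag in soup.find_all(True):
        for name in list(tag.attrs):
            # Drop non-empty attributes that are neither whitelisted nor data-*;
            # data-* attributes are preserved after this change.
            if tag[name] and name not in keep_attrs and not name.startswith("data-"):
                del tag[name]
    return str(soup)


print(slim(HTML, keep_links=True))
# style= and onclick= are removed; class, id, href, title and data-sku survive
# (attribute order in the serialized output may differ).

Presumably the extra surviving attributes (id, title, data-*) give the model more stable hooks when it later writes selectors against the slimmed markup.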