diff --git a/metagpt/tools/libs/web_scraping.py b/metagpt/tools/libs/web_scraping.py index 489c3a472..9e7a8041c 100644 --- a/metagpt/tools/libs/web_scraping.py +++ b/metagpt/tools/libs/web_scraping.py @@ -8,13 +8,15 @@ from metagpt.utils.parse_html import simplify_html @register_tool(tags=["web scraping"]) -async def view_page_element_to_scrape(url: str, requirement: str, keep_links: bool = False) -> None: - """view the HTML content of current page to understand the structure. When executed, the content will be printed out +async def view_page_element_to_scrape(url: str, requirement: str, keep_links: bool = False) -> str: + """view the HTML content of current page to understand the structure. Args: url (str): The URL of the web page to scrape. requirement (str): Providing a clear and detailed requirement helps in focusing the inspection on the desired elements. keep_links (bool): Whether to keep the hyperlinks in the HTML content. Set to True if links are required + Returns: + str: The HTML content of the page. """ async with Browser() as browser: await browser.goto(url) @@ -36,7 +38,7 @@ async def view_page_element_to_scrape(url: str, requirement: str, keep_links: bo html = "\n".join(i.text for i in nodes) mem_fs.rm_file(filename) - print(html) + return html # async def get_elements_outerhtml(self, element_ids: list[int]): diff --git a/metagpt/utils/parse_html.py b/metagpt/utils/parse_html.py index 1ed3a620c..031393501 100644 --- a/metagpt/utils/parse_html.py +++ b/metagpt/utils/parse_html.py @@ -41,13 +41,13 @@ class WebPage(BaseModel): def get_slim_soup(self, keep_links: bool = False): soup = _get_soup(self.html) - keep_attrs = ["class"] + keep_attrs = ["class", "id"] if keep_links: - keep_attrs.append("href") + keep_attrs.extend(["href", "title"]) for i in soup.find_all(True): for name in list(i.attrs): - if i[name] and name not in keep_attrs: + if i[name] and name not in keep_attrs and not name.startswith("data-"): del i[name] for i in soup.find_all(["svg", "img", "video", "audio"]):