Mirror of https://github.com/FoundationAgents/MetaGPT.git (synced 2026-05-15 11:02:36 +02:00)
refine web_scraping tool
This commit is contained in:
parent 4c4d9547ff
commit ae861d99cd

2 changed files with 8 additions and 6 deletions
@@ -8,13 +8,15 @@ from metagpt.utils.parse_html import simplify_html
 
 @register_tool(tags=["web scraping"])
-async def view_page_element_to_scrape(url: str, requirement: str, keep_links: bool = False) -> None:
-    """view the HTML content of current page to understand the structure. When executed, the content will be printed out
+async def view_page_element_to_scrape(url: str, requirement: str, keep_links: bool = False) -> str:
+    """view the HTML content of current page to understand the structure.
 
     Args:
         url (str): The URL of the web page to scrape.
         requirement (str): Providing a clear and detailed requirement helps in focusing the inspection on the desired elements.
         keep_links (bool): Whether to keep the hyperlinks in the HTML content. Set to True if links are required
+
+    Returns:
+        str: The HTML content of the page.
     """
     async with Browser() as browser:
         await browser.goto(url)
@@ -36,7 +38,7 @@ async def view_page_element_to_scrape(url: str, requirement: str, keep_links: bo
     html = "\n".join(i.text for i in nodes)
 
     mem_fs.rm_file(filename)
-    print(html)
+    return html
 
 
 # async def get_elements_outerhtml(self, element_ids: list[int]):
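The first changed file turns view_page_element_to_scrape from a print-only tool into one that also returns the slimmed HTML, and documents the return value in the docstring, so callers can use the string programmatically instead of scraping stdout. A minimal usage sketch; the import path and the URL/requirement values are assumptions, not part of this diff:

import asyncio

from metagpt.tools.libs.web_scraping import view_page_element_to_scrape  # assumed module path


async def main() -> None:
    # After this commit the tool returns the slimmed HTML as well as printing it,
    # so the result can be handed to downstream code (e.g. an LLM writing a parser).
    html = await view_page_element_to_scrape(
        url="https://example.com",                      # placeholder URL
        requirement="find the product name and price",  # placeholder requirement
        keep_links=True,                                 # keep href/title attributes
    )
    assert isinstance(html, str)


if __name__ == "__main__":
    asyncio.run(main())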
@@ -41,13 +41,13 @@ class WebPage(BaseModel):
 
     def get_slim_soup(self, keep_links: bool = False):
         soup = _get_soup(self.html)
-        keep_attrs = ["class"]
+        keep_attrs = ["class", "id"]
         if keep_links:
-            keep_attrs.append("href")
+            keep_attrs.extend(["href", "title"])
 
         for i in soup.find_all(True):
             for name in list(i.attrs):
-                if i[name] and name not in keep_attrs:
+                if i[name] and name not in keep_attrs and not name.startswith("data-"):
                     del i[name]
 
         for i in soup.find_all(["svg", "img", "video", "audio"]):
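The second changed file (presumably metagpt/utils/parse_html.py, given the WebPage class and the simplify_html import above) widens the attribute whitelist in get_slim_soup: id is now always kept, title is kept alongside href when keep_links=True, and data-* attributes are no longer stripped. A standalone sketch of the same filtering rule using BeautifulSoup directly, since the internal WebPage/_get_soup helpers are not shown in this diff:

from bs4 import BeautifulSoup

HTML = (
    '<div class="card" id="item-1" style="color:red" data-sku="42">'
    '<a href="/p/42" title="Product" onclick="track()">Buy</a></div>'
)


def slim(html: str, keep_links: bool = False) -> str:
    soup = BeautifulSoup(html, "html.parser")
    keep_attrs = ["class", "id"]              # "id" is newly whitelisted by this commit
    if keep_links:
        keep_attrs.extend(["href", "title"])  # "title" is newly kept as well

    for tag in soup.find_all(True):
        for name in list(tag.attrs):
            # Drop non-empty attributes that are neither whitelisted nor data-*;
            # data-* attributes are preserved after this change.
            if tag[name] and name not in keep_attrs and not name.startswith("data-"):
                del tag[name]
    return str(soup)


print(slim(HTML, keep_links=True))
# style= and onclick= are removed; class, id, href, title and data-sku survive
# (attribute order in the serialized output may differ).

Presumably the extra surviving attributes (id, title, data-*) give the model more stable hooks when it later writes selectors against the slimmed markup.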