mirror of
https://github.com/FoundationAgents/MetaGPT.git
synced 2026-04-28 10:26:32 +02:00
refine web_scraping tool
This commit is contained in:
parent
4c4d9547ff
commit
ae861d99cd
2 changed files with 8 additions and 6 deletions
|
|
@@ -41,13 +41,13 @@ class WebPage(BaseModel):
|
|||
|
||||
def get_slim_soup(self, keep_links: bool = False):
|
||||
soup = _get_soup(self.html)
|
||||
keep_attrs = ["class"]
|
||||
keep_attrs = ["class", "id"]
|
||||
if keep_links:
|
||||
keep_attrs.append("href")
|
||||
keep_attrs.extend(["href", "title"])
|
||||
|
||||
for i in soup.find_all(True):
|
||||
for name in list(i.attrs):
|
||||
if i[name] and name not in keep_attrs:
|
||||
if i[name] and name not in keep_attrs and not name.startswith("data-"):
|
||||
del i[name]
|
||||
|
||||
for i in soup.find_all(["svg", "img", "video", "audio"]):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue