mirror of
https://github.com/FoundationAgents/MetaGPT.git
synced 2026-06-05 14:55:18 +02:00
add crawler tools
This commit is contained in:
parent
d1d44e9cea
commit
4f43b905a2
5 changed files with 83 additions and 11 deletions
|
|
@ -9,6 +9,7 @@
|
|||
from pathlib import Path
|
||||
|
||||
import aiofiles
|
||||
from fsspec.implementations.memory import MemoryFileSystem as _MemoryFileSystem
|
||||
|
||||
from metagpt.logs import logger
|
||||
from metagpt.utils.exceptions import handle_exception
|
||||
|
|
@ -68,3 +69,10 @@ class File:
|
|||
content = b"".join(chunks)
|
||||
logger.debug(f"Successfully read file, the path of file: {file_path}")
|
||||
return content
|
||||
|
||||
|
||||
class MemoryFileSystem(_MemoryFileSystem):
    """In-memory filesystem whose protocol stripping accepts non-string paths.

    The only override coerces the incoming path with ``str()`` before
    delegating to the parent implementation.
    """

    @classmethod
    def _strip_protocol(cls, path):
        """Return the parent's protocol-stripped form of ``str(path)``.

        NOTE(review): presumably this lets ``pathlib.Path`` objects be passed
        where the parent expects a string -- confirm against callers.
        """
        path_text = str(path)
        return super()._strip_protocol(path_text)
|
||||
|
|
|
|||
|
|
@ -7,6 +7,8 @@ from urllib.parse import urljoin, urlparse
|
|||
from bs4 import BeautifulSoup
|
||||
from pydantic import BaseModel, PrivateAttr
|
||||
|
||||
import htmlmin
|
||||
|
||||
|
||||
class WebPage(BaseModel):
|
||||
inner_text: str
|
||||
|
|
@ -38,6 +40,22 @@ class WebPage(BaseModel):
|
|||
elif url.startswith(("http://", "https://")):
|
||||
yield urljoin(self.url, url)
|
||||
|
||||
def get_slim_soup(self, keep_links: bool = False):
    """Return a reduced parse tree of ``self.html``.

    The tree is built by ``_get_soup``. Afterwards every tag loses its
    attributes except ``class`` (and ``href`` when ``keep_links`` is true),
    and all ``svg``, ``img``, ``video`` and ``audio`` elements are removed.

    Args:
        keep_links: When true, ``href`` attributes are preserved as well.

    Returns:
        The modified soup object.
    """
    soup = _get_soup(self.html)
    allowed = ["class", "href"] if keep_links else ["class"]

    for tag in soup.find_all(True):
        # Collect the names first: deleting while walking tag.attrs would
        # mutate the mapping being iterated.
        # NOTE(review): the truthiness test means attributes with an empty
        # value are left in place -- confirm this is intended.
        removable = [attr for attr in list(tag.attrs) if tag[attr] and attr not in allowed]
        for attr in removable:
            del tag[attr]

    for media in soup.find_all(["svg", "img", "video", "audio"]):
        media.decompose()

    return soup
|
||||
|
||||
|
||||
def get_html_content(page: str, base: str):
|
||||
soup = _get_soup(page)
|
||||
|
|
@ -48,7 +66,12 @@ def get_html_content(page: str, base: str):
|
|||
def _get_soup(page: str):
    """Parse ``page`` with ``html.parser`` and drop non-content elements.

    Every ``style``, ``script``, ``head``, ``title`` and ``footer`` element is
    extracted from the tree so that only the page body content remains.

    Args:
        page: Raw HTML text.

    Returns:
        The ``BeautifulSoup`` object with those elements removed.
    """
    soup = BeautifulSoup(page, "html.parser")
    # https://stackoverflow.com/questions/1936466/how-to-scrape-only-visible-webpage-text-with-beautifulsoup
    # A single loop over the full tag list: the diff view showed both the old
    # and the new loop header, which as code would nest the loops and skip
    # footer removal whenever no style/script/head/title tag is present.
    for s in soup(["style", "script", "[document]", "head", "title", "footer"]):
        s.extract()

    return soup
|
||||
|
||||
|
||||
def simplify_html(html: str, url: str, keep_links: bool = False):
    """Return a slimmed, minified version of ``html``.

    The markup is wrapped in a ``WebPage``, reduced with ``get_slim_soup``,
    serialised back to text and passed through ``htmlmin.minify`` with
    comment and empty-space removal enabled.

    Args:
        html: Raw HTML text of the page.
        url: URL stored on the temporary ``WebPage``.
        keep_links: Forwarded to ``get_slim_soup``.

    Returns:
        The output of ``htmlmin.minify`` for the slimmed markup.
    """
    page = WebPage(inner_text="", html=html, url=url)
    slim_markup = page.get_slim_soup(keep_links).decode()
    return htmlmin.minify(slim_markup, remove_comments=True, remove_empty_space=True)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue