add crawler tools

2026-06-05 14:55:18 +02:00 · 2024-05-30 20:04:02 +08:00 · 2024-05-30 20:04:02 +08:00 · 4f43b905a2
commit 4f43b905a2
parent d1d44e9cea
5 changed files with 83 additions and 11 deletions
--- a/metagpt/utils/file.py
+++ b/metagpt/utils/file.py
@@ -9,6 +9,7 @@
 from pathlib import Path

 import aiofiles
+from fsspec.implementations.memory import MemoryFileSystem as _MemoryFileSystem

 from metagpt.logs import logger
 from metagpt.utils.exceptions import handle_exception
@@ -68,3 +69,10 @@ class File:
            content = b"".join(chunks)
            logger.debug(f"Successfully read file, the path of file: {file_path}")
            return content
+
+
+class MemoryFileSystem(_MemoryFileSystem):
+
+    @classmethod
+    def _strip_protocol(cls, path):
+        return super()._strip_protocol(str(path))
--- a/metagpt/utils/parse_html.py
+++ b/metagpt/utils/parse_html.py
@@ -7,6 +7,8 @@ from urllib.parse import urljoin, urlparse
 from bs4 import BeautifulSoup
 from pydantic import BaseModel, PrivateAttr

+import htmlmin
+

 class WebPage(BaseModel):
    inner_text: str
@@ -38,6 +40,22 @@ class WebPage(BaseModel):
            elif url.startswith(("http://", "https://")):
                yield urljoin(self.url, url)

+    def get_slim_soup(self, keep_links: bool = False):
+        soup = _get_soup(self.html)
+        keep_attrs = ["class"]
+        if keep_links:
+            keep_attrs.append("href")
+
+        for i in soup.find_all(True):
+            for name in list(i.attrs):
+                if i[name] and name not in keep_attrs:
+                    del i[name]
+
+        for i in soup.find_all(["svg", "img", "video", "audio"]):
+            i.decompose()
+
+        return soup
+

 def get_html_content(page: str, base: str):
    soup = _get_soup(page)
@@ -48,7 +66,12 @@ def get_html_content(page: str, base: str):
 def _get_soup(page: str):
    soup = BeautifulSoup(page, "html.parser")
    # https://stackoverflow.com/questions/1936466/how-to-scrape-only-visible-webpage-text-with-beautifulsoup
-    for s in soup(["style", "script", "[document]", "head", "title"]):
+    for s in soup(["style", "script", "[document]", "head", "title", "footer"]):
        s.extract()

    return soup
+
+
+def simplify_html(html: str, url: str, keep_links: bool = False):
+    html = WebPage(inner_text="", html=html, url=url).get_slim_soup(keep_links).decode()
+    return htmlmin.minify(html, remove_comments=True, remove_empty_space=True)