Merge branch 'feature-crawler' into 'mgx_ops'

add crawler tools

See merge request pub/MetaGPT!130
2026-07-17 16:41:05 +02:00 · 2024-05-31 08:15:57 +00:00 · 2024-05-31 08:15:57 +00:00 · 0c88a092c9
commit 0c88a092c9
parent d1d44e9cea 9dc5212d47
6 changed files with 87 additions and 12 deletions
--- a/examples/di/crawl_webpage.py
+++ b/examples/di/crawl_webpage.py
@ -6,16 +6,19 @@
 """

 from metagpt.roles.di.data_interpreter import DataInterpreter
+from metagpt.tools.libs.browser import Browser as _
+

 PAPER_LIST_REQ = """"
 Get data from `paperlist` table in https://papercopilot.com/statistics/iclr-statistics/iclr-2024-statistics/,
-and save it to a csv file. paper title must include `multiagent` or `large language model`. *notice: print key variables*
+and save it to a csv file. paper title must include `multiagent` or `large language model`.
+**Notice: view the page element before writing scraping code**
 """

 ECOMMERCE_REQ = """
 Get products data from website https://scrapeme.live/shop/ and save it as a csv file.
-**Notice: Firstly parse the web page encoding and the text HTML structure;
-The first page product name, price, product URL, and image URL must be saved in the csv;**
+The first page product name, price, product URL, and image URL must be saved in the csv.
+**Notice: view the page element before writing scraping code**
 """

 NEWS_36KR_REQ = """从36kr创投平台https://pitchhub.36kr.com/financing-flash 所有初创企业融资的信息, **注意: 这是一个中文网站**;
@ -25,11 +28,12 @@ NEWS_36KR_REQ = """从36kr创投平台https://pitchhub.36kr.com/financing-flash
 3. 反思*快讯的html内容示例*中的规律, 设计正则匹配表达式来获取*`快讯`*的标题、链接、时间;
 4. 筛选最近3天的初创企业融资*`快讯`*, 以list[dict]形式打印前5个。
 5. 将全部结果存在本地csv中
+**Notice: view the page element before writing scraping code**
 """


 async def main():
-    di = DataInterpreter(tools=["scrape_web_playwright"])
+    di = DataInterpreter(tools=["Browser"])

    await di.run(ECOMMERCE_REQ)

--- a/metagpt/rag/engines/simple.py
+++ b/metagpt/rag/engines/simple.py
@ -4,6 +4,7 @@ import json
 import os
 from typing import Any, Optional, Union

+from fsspec import AbstractFileSystem
 from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
 from llama_index.core.callbacks.base import CallbackManager
 from llama_index.core.embeddings import BaseEmbedding
@ -83,6 +84,7 @@ class SimpleEngine(RetrieverQueryEngine):
        llm: LLM = None,
        retriever_configs: list[BaseRetrieverConfig] = None,
        ranker_configs: list[BaseRankerConfig] = None,
+        fs: Optional[AbstractFileSystem] = None,
    ) -> "SimpleEngine":
        """From docs.

@ -100,7 +102,7 @@ class SimpleEngine(RetrieverQueryEngine):
        if not input_dir and not input_files:
            raise ValueError("Must provide either `input_dir` or `input_files`.")

-        documents = SimpleDirectoryReader(input_dir=input_dir, input_files=input_files).load_data()
+        documents = SimpleDirectoryReader(input_dir=input_dir, input_files=input_files, fs=fs).load_data()
        cls._fix_document_metadata(documents)

        index = VectorStoreIndex.from_documents(
--- a/metagpt/tools/libs/browser.py
+++ b/metagpt/tools/libs/browser.py
@ -1,9 +1,12 @@
 from __future__ import annotations
+import contextlib

 from playwright.async_api import async_playwright
-
+from metagpt.utils.file import MemoryFileSystem
+from uuid import uuid4
 from metagpt.const import DEFAULT_WORKSPACE_ROOT
 from metagpt.tools.tool_registry import register_tool
+from metagpt.utils.parse_html import simplify_html
 from metagpt.utils.report import BrowserReporter


@ -35,16 +38,49 @@ class Browser:
        print("Now on page ", url)
        await self._view()

-    async def open_new_page(self, url: str):
+    async def open_new_page(self, url: str, timeout: float = 30000):
        """open a new page in the browser and view the page"""
        async with self.reporter as reporter:
            page = await self.browser.new_page()
            await reporter.async_report(url, "url")
-            await page.goto(url)
+            await page.goto(url, timeout=timeout)
            self.pages[url] = page
            await self._set_current_page(page, url)
            await reporter.async_report(page, "page")

+    async def view_page_element_to_scrape(self, requirement: str, keep_links: bool = False) -> None:
+        """View the simplified HTML of the current page to understand its structure. The content is printed, not returned.
+
+        Args:
+            requirement (str): Providing a clear and detailed requirement helps in focusing the inspection on the desired elements.
+            keep_links (bool): Whether to keep the hyperlinks in the HTML content. Set to True if links are required
+        """
+        html = await self.current_page.content()
+        html = simplify_html(html, url=self.current_page.url, keep_links=keep_links)
+        mem_fs = MemoryFileSystem()
+        filename = f"{uuid4().hex}.html"
+        with mem_fs.open(filename, "w") as f:
+            f.write(html)
+
+        try:
+            # Since RAG is an optional optimization, if it fails, the simplified HTML can be used as a fallback.
+            with contextlib.suppress(Exception):
+                from metagpt.rag.engines import SimpleEngine  # avoid circular import
+
+                # TODO make `from_docs` asynchronous
+                engine = SimpleEngine.from_docs(input_files=[filename], fs=mem_fs)
+                nodes = await engine.aretrieve(requirement)
+                html = "\n".join(i.text for i in nodes) or html  # nothing retrieved: keep the simplified HTML
+        finally:
+            mem_fs.rm_file(filename)  # runs even for errors suppress() does not catch (e.g. task cancellation)
+        print(html)
+
+    async def get_page_content(self) -> str:
+        """Return the HTML of the current page with leading and trailing whitespace stripped."""
+        raw_html = await self.current_page.content()
+        # Only the outer whitespace is trimmed; the markup itself is returned as-is.
+        return raw_html.strip()
+
    async def switch_page(self, url: str):
        """switch to an opened page in the browser and view the page"""
        if url in self.pages:
@ -152,8 +188,8 @@ class Browser:

    async def _view(self, keep_len: int = 5000) -> str:
        """simulate human viewing the current page, return the visible text with links"""
-        visible_text_with_links = await self.current_page.evaluate(VIEW_CONTENT_JS)
-        print("The visible text and their links (if any): ", visible_text_with_links[:keep_len])
+        # visible_text_with_links = await self.current_page.evaluate(VIEW_CONTENT_JS)
+        # print("The visible text and their links (if any): ", visible_text_with_links[:keep_len])
        # html_content = await self._view_page_html(keep_len=keep_len)
        # print("The html content: ", html_content)

--- a/metagpt/utils/file.py
+++ b/metagpt/utils/file.py
@ -9,6 +9,7 @@
 from pathlib import Path

 import aiofiles
+from fsspec.implementations.memory import MemoryFileSystem as _MemoryFileSystem

 from metagpt.logs import logger
 from metagpt.utils.exceptions import handle_exception
@ -68,3 +69,10 @@ class File:
            content = b"".join(chunks)
            logger.debug(f"Successfully read file, the path of file: {file_path}")
            return content
+
+
+class MemoryFileSystem(_MemoryFileSystem):
+    """fsspec in-memory filesystem that converts each path to ``str`` before protocol stripping.
+
+    NOTE(review): presumably this lets callers pass ``pathlib.Path`` objects
+    instead of plain strings -- confirm against the callers.
+    """
+
+    @classmethod
+    def _strip_protocol(cls, path):
+        # Coerce to text first, then defer to the parent implementation.
+        path_text = str(path)
+        return super()._strip_protocol(path_text)
--- a/metagpt/utils/parse_html.py
+++ b/metagpt/utils/parse_html.py
@ -7,6 +7,8 @@ from urllib.parse import urljoin, urlparse
 from bs4 import BeautifulSoup
 from pydantic import BaseModel, PrivateAttr

+import htmlmin
+

 class WebPage(BaseModel):
    inner_text: str
@ -38,6 +40,22 @@ class WebPage(BaseModel):
            elif url.startswith(("http://", "https://")):
                yield urljoin(self.url, url)

+    def get_slim_soup(self, keep_links: bool = False):
+        """Parse ``self.html`` and return the soup with non-kept attributes and svg/img/video/audio tags removed."""
+        soup = _get_soup(self.html)
+        kept = {"class", "href"} if keep_links else {"class"}
+
+        for tag in soup.find_all(True):
+            # Only attributes with a truthy value are dropped; "class" (and "href" when requested) stay.
+            removable = [attr for attr in tag.attrs if tag[attr] and attr not in kept]
+            for attr in removable:
+                del tag[attr]
+
+        for media in soup.find_all(["svg", "img", "video", "audio"]):
+            media.decompose()
+
+        return soup
+

 def get_html_content(page: str, base: str):
    soup = _get_soup(page)
@ -48,7 +66,12 @@ def get_html_content(page: str, base: str):
 def _get_soup(page: str):
    soup = BeautifulSoup(page, "html.parser")
    # https://stackoverflow.com/questions/1936466/how-to-scrape-only-visible-webpage-text-with-beautifulsoup
-    for s in soup(["style", "script", "[document]", "head", "title"]):
+    for s in soup(["style", "script", "[document]", "head", "title", "footer"]):
        s.extract()

    return soup
+
+
+def simplify_html(html: str, url: str, keep_links: bool = False):
+    """Return a slimmed, minified version of ``html``.
+
+    The markup is reduced with ``WebPage.get_slim_soup`` and then minified with
+    ``htmlmin``, removing comments and empty space.
+
+    Args:
+        html: The page HTML to simplify.
+        url: The URL of the page, stored on the temporary ``WebPage``.
+        keep_links: Passed through to ``get_slim_soup``; when true, ``href`` attributes are kept.
+    """
+    page = WebPage(inner_text="", html=html, url=url)
+    slim_markup = page.get_slim_soup(keep_links).decode()
+    return htmlmin.minify(slim_markup, remove_comments=True, remove_empty_space=True)
--- a/requirements.txt
+++ b/requirements.txt
@ -71,4 +71,6 @@ dashscope==1.14.1
 rank-bm25==0.2.2  # for tool recommendation
 gymnasium==0.29.1
 pylint~=3.0.3
-pygithub~=2.3
+pygithub~=2.3
+htmlmin
+fsspec