diff --git a/examples/di/crawl_webpage.py b/examples/di/crawl_webpage.py
index b8226f4f4..10b230f2b 100644
--- a/examples/di/crawl_webpage.py
+++ b/examples/di/crawl_webpage.py
@@ -6,16 +6,19 @@
 """
 
 from metagpt.roles.di.data_interpreter import DataInterpreter
+from metagpt.tools.libs.browser import Browser as _  # noqa: F401  (imported only to register the Browser tool)
+
 
-PAPER_LIST_REQ = """"
+PAPER_LIST_REQ = """
 Get data from `paperlist` table in https://papercopilot.com/statistics/iclr-statistics/iclr-2024-statistics/,
-and save it to a csv file. paper title must include `multiagent` or `large language model`. *notice: print key variables*
+and save it to a csv file. paper title must include `multiagent` or `large language model`.
+**Notice: view the page element before writing scraping code**
 """
 
 ECOMMERCE_REQ = """
 Get products data from website https://scrapeme.live/shop/ and save it as a csv file.
-**Notice: Firstly parse the web page encoding and the text HTML structure;
-The first page product name, price, product URL, and image URL must be saved in the csv;**
+The first page product name, price, product URL, and image URL must be saved in the csv.
+**Notice: view the page element before writing scraping code**
 """
 
 NEWS_36KR_REQ = """从36kr创投平台https://pitchhub.36kr.com/financing-flash 所有初创企业融资的信息, **注意: 这是一个中文网站**;
@@ -25,11 +28,12 @@ NEWS_36KR_REQ = """从36kr创投平台https://pitchhub.36kr.com/financing-flash
 3. 反思*快讯的html内容示例*中的规律, 设计正则匹配表达式来获取*`快讯`*的标题、链接、时间;
 4. 筛选最近3天的初创企业融资*`快讯`*, 以list[dict]形式打印前5个。
 5. 将全部结果存在本地csv中
+**Notice: view the page element before writing scraping code**
 """
 
 
 async def main():
-    di = DataInterpreter(tools=["scrape_web_playwright"])
+    di = DataInterpreter(tools=["Browser"])
     await di.run(ECOMMERCE_REQ)
 
 
diff --git a/metagpt/rag/engines/simple.py b/metagpt/rag/engines/simple.py
index 5c5810308..623b3f350 100644
--- a/metagpt/rag/engines/simple.py
+++ b/metagpt/rag/engines/simple.py
@@ -4,6 +4,7 @@ import json
 import os
 from typing import Any, Optional, Union
 
+from fsspec import AbstractFileSystem
 from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
 from llama_index.core.callbacks.base import CallbackManager
 from llama_index.core.embeddings import BaseEmbedding
@@ -83,6 +84,7 @@ class SimpleEngine(RetrieverQueryEngine):
         llm: LLM = None,
         retriever_configs: list[BaseRetrieverConfig] = None,
         ranker_configs: list[BaseRankerConfig] = None,
+        fs: Optional[AbstractFileSystem] = None,
     ) -> "SimpleEngine":
         """From docs.
 
@@ -100,7 +102,7 @@ class SimpleEngine(RetrieverQueryEngine):
         if not input_dir and not input_files:
             raise ValueError("Must provide either `input_dir` or `input_files`.")
 
-        documents = SimpleDirectoryReader(input_dir=input_dir, input_files=input_files).load_data()
+        documents = SimpleDirectoryReader(input_dir=input_dir, input_files=input_files, fs=fs).load_data()
         cls._fix_document_metadata(documents)
 
         index = VectorStoreIndex.from_documents(
diff --git a/metagpt/tools/libs/browser.py b/metagpt/tools/libs/browser.py
index 7fde804fe..8d6daec11 100644
--- a/metagpt/tools/libs/browser.py
+++ b/metagpt/tools/libs/browser.py
@@ -1,9 +1,14 @@
 from __future__ import annotations
 
+import contextlib
+from uuid import uuid4
+
 from playwright.async_api import async_playwright
 
 from metagpt.const import DEFAULT_WORKSPACE_ROOT
 from metagpt.tools.tool_registry import register_tool
+from metagpt.utils.file import MemoryFileSystem
+from metagpt.utils.parse_html import simplify_html
 from metagpt.utils.report import BrowserReporter
 
 
@@ -35,16 +40,58 @@ class Browser:
         print("Now on page ", url)
         await self._view()
 
-    async def open_new_page(self, url: str):
-        """open a new page in the browser and view the page"""
+    async def open_new_page(self, url: str, timeout: float = 30000):
+        """open a new page in the browser and view the page
+
+        Args:
+            url (str): The address of the page to open.
+            timeout (float): Maximum navigation time in milliseconds. Defaults to 30000.
+        """
         async with self.reporter as reporter:
             page = await self.browser.new_page()
             await reporter.async_report(url, "url")
-            await page.goto(url)
+            await page.goto(url, timeout=timeout)
             self.pages[url] = page
             await self._set_current_page(page, url)
             await reporter.async_report(page, "page")
 
+    async def view_page_element_to_scrape(self, requirement: str, keep_links: bool = False) -> None:
+        """view the HTML content of current page to understand the structure. When executed, the content will be printed out
+
+        Args:
+            requirement (str): Providing a clear and detailed requirement helps in focusing the inspection on the desired elements.
+            keep_links (bool): Whether to keep the hyperlinks in the HTML content. Set to True if links are required
+        """
+        html = await self.current_page.content()
+        html = simplify_html(html, url=self.current_page.url, keep_links=keep_links)
+        mem_fs = MemoryFileSystem()
+        filename = f"{uuid4().hex}.html"
+        with mem_fs.open(filename, "w") as f:
+            f.write(html)
+
+        try:
+            # Since RAG is an optional optimization, if it fails, the simplified HTML can be used as a fallback.
+            with contextlib.suppress(Exception):
+                from metagpt.rag.engines import SimpleEngine  # avoid circular import
+
+                # TODO make `from_docs` asynchronous
+                engine = SimpleEngine.from_docs(input_files=[filename], fs=mem_fs)
+                nodes = await engine.aretrieve(requirement)
+                # Only narrow the HTML down when retrieval found something; otherwise keep the fallback.
+                if nodes:
+                    html = "\n".join(i.text for i in nodes)
+        finally:
+            # Remove the temporary file even if the task is cancelled while retrieving.
+            mem_fs.rm_file(filename)
+
+        print(html)
+
+    async def get_page_content(self) -> str:
+        """Get the HTML content of current page."""
+        html = await self.current_page.content()
+        html_content = html.strip()
+        return html_content
+
     async def switch_page(self, url: str):
         """switch to an opened page in the browser and view the page"""
         if url in self.pages:
@@ -152,8 +199,8 @@ class Browser:
 
     async def _view(self, keep_len: int = 5000) -> str:
         """simulate human viewing the current page, return the visible text with links"""
-        visible_text_with_links = await self.current_page.evaluate(VIEW_CONTENT_JS)
-        print("The visible text and their links (if any): ", visible_text_with_links[:keep_len])
+        # visible_text_with_links = await self.current_page.evaluate(VIEW_CONTENT_JS)
+        # print("The visible text and their links (if any): ", visible_text_with_links[:keep_len])
 
         # html_content = await self._view_page_html(keep_len=keep_len)
         # print("The html content: ", html_content)
diff --git a/metagpt/utils/file.py b/metagpt/utils/file.py
index f62b44eb8..a8ed482d9 100644
--- a/metagpt/utils/file.py
+++ b/metagpt/utils/file.py
@@ -9,6 +9,7 @@
 from pathlib import Path
 
 import aiofiles
+from fsspec.implementations.memory import MemoryFileSystem as _MemoryFileSystem
 
 from metagpt.logs import logger
 from metagpt.utils.exceptions import handle_exception
@@ -68,3 +69,12 @@ class File:
             content = b"".join(chunks)
             logger.debug(f"Successfully read file, the path of file: {file_path}")
             return content
+
+
+class MemoryFileSystem(_MemoryFileSystem):
+    """In-memory filesystem that also accepts non-string paths."""
+
+    @classmethod
+    def _strip_protocol(cls, path):
+        # Coerce to `str` before delegating so path-like objects (e.g. `pathlib.Path`) work too.
+        return super()._strip_protocol(str(path))
diff --git a/metagpt/utils/parse_html.py b/metagpt/utils/parse_html.py
index 65aa3f236..3aac8ca6c 100644
--- a/metagpt/utils/parse_html.py
+++ b/metagpt/utils/parse_html.py
@@ -7,6 +7,8 @@ from urllib.parse import urljoin, urlparse
 from bs4 import BeautifulSoup
 from pydantic import BaseModel, PrivateAttr
 
+import htmlmin
+
 
 class WebPage(BaseModel):
     inner_text: str
@@ -38,6 +40,31 @@ class WebPage(BaseModel):
             elif url.startswith(("http://", "https://")):
                 yield urljoin(self.url, url)
 
+    def get_slim_soup(self, keep_links: bool = False) -> BeautifulSoup:
+        """Build a slimmed-down soup of the page that keeps the structure but drops noise.
+
+        Args:
+            keep_links: Whether to keep `href` attributes.
+
+        Returns:
+            The soup with every non-empty attribute except `class` (and optionally `href`)
+            removed and all `svg`, `img`, `video` and `audio` elements dropped.
+        """
+        soup = _get_soup(self.html)
+        keep_attrs = ["class"]
+        if keep_links:
+            keep_attrs.append("href")
+
+        for i in soup.find_all(True):
+            for name in list(i.attrs):
+                if i[name] and name not in keep_attrs:
+                    del i[name]
+
+        for i in soup.find_all(["svg", "img", "video", "audio"]):
+            i.decompose()
+
+        return soup
+
 
 def get_html_content(page: str, base: str):
     soup = _get_soup(page)
@@ -48,7 +75,22 @@ def get_html_content(page: str, base: str):
 def _get_soup(page: str):
     soup = BeautifulSoup(page, "html.parser")
     # https://stackoverflow.com/questions/1936466/how-to-scrape-only-visible-webpage-text-with-beautifulsoup
-    for s in soup(["style", "script", "[document]", "head", "title"]):
+    for s in soup(["style", "script", "[document]", "head", "title", "footer"]):
         s.extract()
 
     return soup
+
+
+def simplify_html(html: str, url: str, keep_links: bool = False) -> str:
+    """Return a slimmed-down, minified version of `html`.
+
+    Args:
+        html: Raw HTML of the page.
+        url: URL the page was loaded from.
+        keep_links: Whether to keep `href` attributes.
+
+    Returns:
+        The simplified HTML with comments and empty space removed.
+    """
+    html = WebPage(inner_text="", html=html, url=url).get_slim_soup(keep_links).decode()
+    return htmlmin.minify(html, remove_comments=True, remove_empty_space=True)
diff --git a/requirements.txt b/requirements.txt
index b40c69c9f..83a904156 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -71,4 +71,6 @@ dashscope==1.14.1
 rank-bm25==0.2.2 # for tool recommendation
 gymnasium==0.29.1
 pylint~=3.0.3
-pygithub~=2.3
\ No newline at end of file
+pygithub~=2.3
+htmlmin
+fsspec