Merge branch 'feature-crawler' into 'mgx_ops'

add crawler tools

See merge request pub/MetaGPT!130
This commit is contained in:
林义章 2024-05-31 08:15:57 +00:00
commit 0c88a092c9
6 changed files with 87 additions and 12 deletions

View file

@ -6,16 +6,19 @@
"""
from metagpt.roles.di.data_interpreter import DataInterpreter
from metagpt.tools.libs.browser import Browser as _
# Task prompt: scrape the ICLR 2024 `paperlist` table and keep only the papers
# whose title mentions `multiagent` or `large language model`.
# The opening delimiter is exactly three quotes so the prompt does not start
# with a stray `"` character, and the superseded sentence is kept only once.
PAPER_LIST_REQ = """
Get data from `paperlist` table in https://papercopilot.com/statistics/iclr-statistics/iclr-2024-statistics/,
and save it to a csv file. paper title must include `multiagent` or `large language model`.
**Notice: view the page element before writing scraping code**
"""
# Task prompt: scrape the first page of the demo shop and export the products.
# The superseded "must be saved in the csv;**" sentence is dropped so the
# requirement is stated once, in its current wording.
ECOMMERCE_REQ = """
Get products data from website https://scrapeme.live/shop/ and save it as a csv file.
**Notice: Firstly parse the web page encoding and the text HTML structure;
The first page product name, price, product URL, and image URL must be saved in the csv.
**Notice: view the page element before writing scraping code**
"""
NEWS_36KR_REQ = """从36kr创投平台https://pitchhub.36kr.com/financing-flash 所有初创企业融资的信息, **注意: 这是一个中文网站**;
@ -25,11 +28,12 @@ NEWS_36KR_REQ = """从36kr创投平台https://pitchhub.36kr.com/financing-flash
3. 反思*快讯的html内容示例*中的规律, 设计正则匹配表达式来获取*`快讯`*的标题链接时间;
4. 筛选最近3天的初创企业融资*`快讯`*, 以list[dict]形式打印前5个
5. 将全部结果存在本地csv中
**Notice: view the page element before writing scraping code**
"""
async def main():
    """Run the e-commerce scraping request with a DataInterpreter that has the Browser tool."""
    # Only one interpreter is built: the earlier `scrape_web_playwright`
    # instance was overwritten before it was ever used.
    di = DataInterpreter(tools=["Browser"])
    await di.run(ECOMMERCE_REQ)

View file

@ -4,6 +4,7 @@ import json
import os
from typing import Any, Optional, Union
from fsspec import AbstractFileSystem
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.callbacks.base import CallbackManager
from llama_index.core.embeddings import BaseEmbedding
@ -83,6 +84,7 @@ class SimpleEngine(RetrieverQueryEngine):
llm: LLM = None,
retriever_configs: list[BaseRetrieverConfig] = None,
ranker_configs: list[BaseRankerConfig] = None,
fs: Optional[AbstractFileSystem] = None,
) -> "SimpleEngine":
"""From docs.
@ -100,7 +102,7 @@ class SimpleEngine(RetrieverQueryEngine):
if not input_dir and not input_files:
raise ValueError("Must provide either `input_dir` or `input_files`.")
documents = SimpleDirectoryReader(input_dir=input_dir, input_files=input_files).load_data()
documents = SimpleDirectoryReader(input_dir=input_dir, input_files=input_files, fs=fs).load_data()
cls._fix_document_metadata(documents)
index = VectorStoreIndex.from_documents(

View file

@ -1,9 +1,12 @@
from __future__ import annotations
import contextlib
from playwright.async_api import async_playwright
from metagpt.utils.file import MemoryFileSystem
from uuid import uuid4
from metagpt.const import DEFAULT_WORKSPACE_ROOT
from metagpt.tools.tool_registry import register_tool
from metagpt.utils.parse_html import simplify_html
from metagpt.utils.report import BrowserReporter
@ -35,16 +38,49 @@ class Browser:
print("Now on page ", url)
await self._view()
async def open_new_page(self, url: str, timeout: float = 30000):
    """Open a new page in the browser and view the page.

    Args:
        url (str): Address to navigate to; also used as the key in ``self.pages``.
        timeout (float): Navigation timeout forwarded to ``page.goto``
            (presumably milliseconds, following Playwright's convention -- confirm).
    """
    async with self.reporter as reporter:
        page = await self.browser.new_page()
        await reporter.async_report(url, "url")
        # Navigate exactly once, honouring the caller-supplied timeout.
        await page.goto(url, timeout=timeout)
        self.pages[url] = page
        await self._set_current_page(page, url)
        await reporter.async_report(page, "page")
async def view_page_element_to_scrape(self, requirement: str, keep_links: bool = False) -> None:
    """view the HTML content of current page to understand the structure. When executed, the content will be printed out

    The page HTML is simplified first. A retrieval step then tries to narrow it
    down to the parts relevant to ``requirement``; if that step fails or finds
    nothing, the full simplified HTML is printed instead.

    Args:
        requirement (str): Providing a clear and detailed requirement helps in focusing the inspection on the desired elements.
        keep_links (bool): Whether to keep the hyperlinks in the HTML content. Set to True if links are required
    """
    html = await self.current_page.content()
    html = simplify_html(html, url=self.current_page.url, keep_links=keep_links)

    # Stage the simplified HTML in an in-memory file so the RAG reader can load it.
    mem_fs = MemoryFileSystem()
    filename = f"{uuid4().hex}.html"
    with mem_fs.open(filename, "w") as f:
        f.write(html)

    try:
        # Since RAG is an optional optimization, if it fails, the simplified HTML can be used as a fallback.
        with contextlib.suppress(Exception):
            from metagpt.rag.engines import SimpleEngine  # avoid circular import

            # TODO make `from_docs` asynchronous
            engine = SimpleEngine.from_docs(input_files=[filename], fs=mem_fs)
            nodes = await engine.aretrieve(requirement)
            # An empty retrieval must not wipe out the fallback HTML.
            if nodes:
                html = "\n".join(i.text for i in nodes)
    finally:
        # Always drop the temporary entry, even when something escapes the
        # suppress block above (e.g. task cancellation).
        mem_fs.rm_file(filename)

    print(html)
async def get_page_content(self) -> str:
    """Return the current page's HTML with leading and trailing whitespace removed."""
    raw_html = await self.current_page.content()
    return raw_html.strip()
async def switch_page(self, url: str):
"""switch to an opened page in the browser and view the page"""
if url in self.pages:
@ -152,8 +188,8 @@ class Browser:
async def _view(self, keep_len: int = 5000) -> str:
"""simulate human viewing the current page, return the visible text with links"""
visible_text_with_links = await self.current_page.evaluate(VIEW_CONTENT_JS)
print("The visible text and their links (if any): ", visible_text_with_links[:keep_len])
# visible_text_with_links = await self.current_page.evaluate(VIEW_CONTENT_JS)
# print("The visible text and their links (if any): ", visible_text_with_links[:keep_len])
# html_content = await self._view_page_html(keep_len=keep_len)
# print("The html content: ", html_content)

View file

@ -9,6 +9,7 @@
from pathlib import Path
import aiofiles
from fsspec.implementations.memory import MemoryFileSystem as _MemoryFileSystem
from metagpt.logs import logger
from metagpt.utils.exceptions import handle_exception
@ -68,3 +69,10 @@ class File:
content = b"".join(chunks)
logger.debug(f"Successfully read file, the path of file: {file_path}")
return content
class MemoryFileSystem(_MemoryFileSystem):
    """In-memory filesystem whose path handling also accepts non-string paths."""

    @classmethod
    def _strip_protocol(cls, path):
        """Convert ``path`` to ``str`` and delegate to the parent implementation."""
        path_text = str(path)
        return super()._strip_protocol(path_text)

View file

@ -7,6 +7,8 @@ from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from pydantic import BaseModel, PrivateAttr
import htmlmin
class WebPage(BaseModel):
inner_text: str
@ -38,6 +40,22 @@ class WebPage(BaseModel):
elif url.startswith(("http://", "https://")):
yield urljoin(self.url, url)
def get_slim_soup(self, keep_links: bool = False):
    """Return a parsed copy of ``self.html`` reduced to its structural skeleton.

    Every tag keeps only its ``class`` attribute (plus ``href`` when
    ``keep_links`` is true); other attributes with a truthy value are removed,
    while attributes whose value is falsy are left in place. ``svg``, ``img``,
    ``video`` and ``audio`` elements are dropped entirely.

    Args:
        keep_links (bool): Keep ``href`` attributes when true.
    """
    allowed_attrs = ["class", "href"] if keep_links else ["class"]
    soup = _get_soup(self.html)

    for tag in soup.find_all(True):
        # Collect first, delete afterwards, so the attribute dict is not
        # mutated while it is being iterated.
        removable = [attr for attr in tag.attrs if tag[attr] and attr not in allowed_attrs]
        for attr in removable:
            del tag[attr]

    for media_tag in soup.find_all(["svg", "img", "video", "audio"]):
        media_tag.decompose()

    return soup
def get_html_content(page: str, base: str):
soup = _get_soup(page)
@ -48,7 +66,12 @@ def get_html_content(page: str, base: str):
def _get_soup(page: str):
    """Parse ``page`` with ``html.parser`` and strip elements that carry no visible page text.

    Args:
        page (str): Raw HTML markup.

    Returns:
        The ``BeautifulSoup`` tree with ``style``, ``script``, ``[document]``,
        ``head``, ``title`` and ``footer`` elements extracted.
    """
    soup = BeautifulSoup(page, "html.parser")
    # https://stackoverflow.com/questions/1936466/how-to-scrape-only-visible-webpage-text-with-beautifulsoup
    # Single loop over the current tag list (which includes "footer"); the
    # superseded header without "footer" is not repeated.
    for s in soup(["style", "script", "[document]", "head", "title", "footer"]):
        s.extract()
    return soup
def simplify_html(html: str, url: str, keep_links: bool = False):
    """Slim down ``html`` via ``WebPage.get_slim_soup`` and return it minified.

    Args:
        html (str): Raw HTML of the page.
        url (str): URL of the page, passed to the ``WebPage`` model.
        keep_links (bool): Forwarded to ``get_slim_soup``.

    Returns:
        The result of ``htmlmin.minify`` with comments and empty space removed.
    """
    page = WebPage(inner_text="", html=html, url=url)
    slim_markup = page.get_slim_soup(keep_links).decode()
    return htmlmin.minify(slim_markup, remove_comments=True, remove_empty_space=True)

View file

@ -71,4 +71,6 @@ dashscope==1.14.1
rank-bm25==0.2.2 # for tool recommendation
gymnasium==0.29.1
pylint~=3.0.3
pygithub~=2.3
htmlmin
fsspec