mirror of
https://github.com/FoundationAgents/MetaGPT.git
synced 2026-06-02 14:45:17 +02:00
Merge branch 'feature-crawler' into 'mgx_ops'
add crawler tools See merge request pub/MetaGPT!130
This commit is contained in:
commit
0c88a092c9
6 changed files with 87 additions and 12 deletions
|
|
@ -6,16 +6,19 @@
|
|||
"""
|
||||
|
||||
from metagpt.roles.di.data_interpreter import DataInterpreter
|
||||
from metagpt.tools.libs.browser import Browser as _
|
||||
|
||||
|
||||
PAPER_LIST_REQ = """"
|
||||
Get data from `paperlist` table in https://papercopilot.com/statistics/iclr-statistics/iclr-2024-statistics/,
|
||||
and save it to a csv file. paper title must include `multiagent` or `large language model`. *notice: print key variables*
|
||||
and save it to a csv file. paper title must include `multiagent` or `large language model`.
|
||||
**Notice: view the page element before writing scraping code**
|
||||
"""
|
||||
|
||||
ECOMMERCE_REQ = """
|
||||
Get products data from website https://scrapeme.live/shop/ and save it as a csv file.
|
||||
**Notice: Firstly parse the web page encoding and the text HTML structure;
|
||||
The first page product name, price, product URL, and image URL must be saved in the csv;**
|
||||
The first page product name, price, product URL, and image URL must be saved in the csv.
|
||||
**Notice: view the page element before writing scraping code**
|
||||
"""
|
||||
|
||||
NEWS_36KR_REQ = """从36kr创投平台https://pitchhub.36kr.com/financing-flash 所有初创企业融资的信息, **注意: 这是一个中文网站**;
|
||||
|
|
@ -25,11 +28,12 @@ NEWS_36KR_REQ = """从36kr创投平台https://pitchhub.36kr.com/financing-flash
|
|||
3. 反思*快讯的html内容示例*中的规律, 设计正则匹配表达式来获取*`快讯`*的标题、链接、时间;
|
||||
4. 筛选最近3天的初创企业融资*`快讯`*, 以list[dict]形式打印前5个。
|
||||
5. 将全部结果存在本地csv中
|
||||
**Notice: view the page element before writing scraping code**
|
||||
"""
|
||||
|
||||
|
||||
async def main():
|
||||
di = DataInterpreter(tools=["scrape_web_playwright"])
|
||||
di = DataInterpreter(tools=["Browser"])
|
||||
|
||||
await di.run(ECOMMERCE_REQ)
|
||||
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ import json
|
|||
import os
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
from fsspec import AbstractFileSystem
|
||||
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
|
||||
from llama_index.core.callbacks.base import CallbackManager
|
||||
from llama_index.core.embeddings import BaseEmbedding
|
||||
|
|
@ -83,6 +84,7 @@ class SimpleEngine(RetrieverQueryEngine):
|
|||
llm: LLM = None,
|
||||
retriever_configs: list[BaseRetrieverConfig] = None,
|
||||
ranker_configs: list[BaseRankerConfig] = None,
|
||||
fs: Optional[AbstractFileSystem] = None,
|
||||
) -> "SimpleEngine":
|
||||
"""From docs.
|
||||
|
||||
|
|
@ -100,7 +102,7 @@ class SimpleEngine(RetrieverQueryEngine):
|
|||
if not input_dir and not input_files:
|
||||
raise ValueError("Must provide either `input_dir` or `input_files`.")
|
||||
|
||||
documents = SimpleDirectoryReader(input_dir=input_dir, input_files=input_files).load_data()
|
||||
documents = SimpleDirectoryReader(input_dir=input_dir, input_files=input_files, fs=fs).load_data()
|
||||
cls._fix_document_metadata(documents)
|
||||
|
||||
index = VectorStoreIndex.from_documents(
|
||||
|
|
|
|||
|
|
@ -1,9 +1,12 @@
|
|||
from __future__ import annotations
|
||||
import contextlib
|
||||
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
from metagpt.utils.file import MemoryFileSystem
|
||||
from uuid import uuid4
|
||||
from metagpt.const import DEFAULT_WORKSPACE_ROOT
|
||||
from metagpt.tools.tool_registry import register_tool
|
||||
from metagpt.utils.parse_html import simplify_html
|
||||
from metagpt.utils.report import BrowserReporter
|
||||
|
||||
|
||||
|
|
@ -35,16 +38,49 @@ class Browser:
|
|||
print("Now on page ", url)
|
||||
await self._view()
|
||||
|
||||
async def open_new_page(self, url: str):
|
||||
async def open_new_page(self, url: str, timeout: float = 30000):
|
||||
"""open a new page in the browser and view the page"""
|
||||
async with self.reporter as reporter:
|
||||
page = await self.browser.new_page()
|
||||
await reporter.async_report(url, "url")
|
||||
await page.goto(url)
|
||||
await page.goto(url, timeout=timeout)
|
||||
self.pages[url] = page
|
||||
await self._set_current_page(page, url)
|
||||
await reporter.async_report(page, "page")
|
||||
|
||||
async def view_page_element_to_scrape(self, requirement: str, keep_links: bool = False) -> None:
|
||||
"""view the HTML content of current page to understand the structure. When executed, the content will be printed out
|
||||
|
||||
Args:
|
||||
requirement (str): Providing a clear and detailed requirement helps in focusing the inspection on the desired elements.
|
||||
keep_links (bool): Whether to keep the hyperlinks in the HTML content. Set to True if links are required
|
||||
"""
|
||||
html = await self.current_page.content()
|
||||
html = simplify_html(html, url=self.current_page.url, keep_links=keep_links)
|
||||
mem_fs = MemoryFileSystem()
|
||||
filename = f"{uuid4().hex}.html"
|
||||
with mem_fs.open(filename, "w") as f:
|
||||
f.write(html)
|
||||
|
||||
# Since RAG is an optional optimization, if it fails, the simplified HTML can be used as a fallback.
|
||||
with contextlib.suppress(Exception):
|
||||
|
||||
from metagpt.rag.engines import SimpleEngine # avoid circular import
|
||||
|
||||
# TODO make `from_docs` asynchronous
|
||||
engine = SimpleEngine.from_docs(input_files=[filename], fs=mem_fs)
|
||||
nodes = await engine.aretrieve(requirement)
|
||||
html = "\n".join(i.text for i in nodes)
|
||||
|
||||
mem_fs.rm_file(filename)
|
||||
print(html)
|
||||
|
||||
async def get_page_content(self) -> str:
|
||||
"""Get the HTML content of current page."""
|
||||
html = await self.current_page.content()
|
||||
html_content = html.strip()
|
||||
return html_content
|
||||
|
||||
async def switch_page(self, url: str):
|
||||
"""switch to an opened page in the browser and view the page"""
|
||||
if url in self.pages:
|
||||
|
|
@ -152,8 +188,8 @@ class Browser:
|
|||
|
||||
async def _view(self, keep_len: int = 5000) -> str:
|
||||
"""simulate human viewing the current page, return the visible text with links"""
|
||||
visible_text_with_links = await self.current_page.evaluate(VIEW_CONTENT_JS)
|
||||
print("The visible text and their links (if any): ", visible_text_with_links[:keep_len])
|
||||
# visible_text_with_links = await self.current_page.evaluate(VIEW_CONTENT_JS)
|
||||
# print("The visible text and their links (if any): ", visible_text_with_links[:keep_len])
|
||||
# html_content = await self._view_page_html(keep_len=keep_len)
|
||||
# print("The html content: ", html_content)
|
||||
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@
|
|||
from pathlib import Path
|
||||
|
||||
import aiofiles
|
||||
from fsspec.implementations.memory import MemoryFileSystem as _MemoryFileSystem
|
||||
|
||||
from metagpt.logs import logger
|
||||
from metagpt.utils.exceptions import handle_exception
|
||||
|
|
@ -68,3 +69,10 @@ class File:
|
|||
content = b"".join(chunks)
|
||||
logger.debug(f"Successfully read file, the path of file: {file_path}")
|
||||
return content
|
||||
|
||||
|
||||
class MemoryFileSystem(_MemoryFileSystem):
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
return super()._strip_protocol(str(path))
|
||||
|
|
|
|||
|
|
@ -7,6 +7,8 @@ from urllib.parse import urljoin, urlparse
|
|||
from bs4 import BeautifulSoup
|
||||
from pydantic import BaseModel, PrivateAttr
|
||||
|
||||
import htmlmin
|
||||
|
||||
|
||||
class WebPage(BaseModel):
|
||||
inner_text: str
|
||||
|
|
@ -38,6 +40,22 @@ class WebPage(BaseModel):
|
|||
elif url.startswith(("http://", "https://")):
|
||||
yield urljoin(self.url, url)
|
||||
|
||||
def get_slim_soup(self, keep_links: bool = False):
|
||||
soup = _get_soup(self.html)
|
||||
keep_attrs = ["class"]
|
||||
if keep_links:
|
||||
keep_attrs.append("href")
|
||||
|
||||
for i in soup.find_all(True):
|
||||
for name in list(i.attrs):
|
||||
if i[name] and name not in keep_attrs:
|
||||
del i[name]
|
||||
|
||||
for i in soup.find_all(["svg", "img", "video", "audio"]):
|
||||
i.decompose()
|
||||
|
||||
return soup
|
||||
|
||||
|
||||
def get_html_content(page: str, base: str):
|
||||
soup = _get_soup(page)
|
||||
|
|
@ -48,7 +66,12 @@ def get_html_content(page: str, base: str):
|
|||
def _get_soup(page: str):
|
||||
soup = BeautifulSoup(page, "html.parser")
|
||||
# https://stackoverflow.com/questions/1936466/how-to-scrape-only-visible-webpage-text-with-beautifulsoup
|
||||
for s in soup(["style", "script", "[document]", "head", "title"]):
|
||||
for s in soup(["style", "script", "[document]", "head", "title", "footer"]):
|
||||
s.extract()
|
||||
|
||||
return soup
|
||||
|
||||
|
||||
def simplify_html(html: str, url: str, keep_links: bool = False):
|
||||
html = WebPage(inner_text="", html=html, url=url).get_slim_soup(keep_links).decode()
|
||||
return htmlmin.minify(html, remove_comments=True, remove_empty_space=True)
|
||||
|
|
|
|||
|
|
@ -71,4 +71,6 @@ dashscope==1.14.1
|
|||
rank-bm25==0.2.2 # for tool recommendation
|
||||
gymnasium==0.29.1
|
||||
pylint~=3.0.3
|
||||
pygithub~=2.3
|
||||
pygithub~=2.3
|
||||
htmlmin
|
||||
fsspec
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue