add crawler tools

2026-06-14 15:25:17 +02:00 · 2024-05-30 20:04:02 +08:00 · 2024-05-30 20:04:02 +08:00 · 4f43b905a2
commit 4f43b905a2
parent d1d44e9cea
5 changed files with 83 additions and 11 deletions
--- a/examples/di/crawl_webpage.py
+++ b/examples/di/crawl_webpage.py
@@ -6,16 +6,19 @@
 """

 from metagpt.roles.di.data_interpreter import DataInterpreter
+from metagpt.tools.libs.browser import Browser as _
+

 PAPER_LIST_REQ = """"
 Get data from `paperlist` table in https://papercopilot.com/statistics/iclr-statistics/iclr-2024-statistics/,
-and save it to a csv file. paper title must include `multiagent` or `large language model`. *notice: print key variables*
+and save it to a csv file. paper title must include `multiagent` or `large language model`.
+**Notice: view the page element before writing scraping code**
 """

 ECOMMERCE_REQ = """
 Get products data from website https://scrapeme.live/shop/ and save it as a csv file.
-**Notice: Firstly parse the web page encoding and the text HTML structure;
-The first page product name, price, product URL, and image URL must be saved in the csv;**
+The first page product name, price, product URL, and image URL must be saved in the csv.
+**Notice: view the page element before writing scraping code**
 """

 NEWS_36KR_REQ = """从36kr创投平台https://pitchhub.36kr.com/financing-flash 所有初创企业融资的信息, **注意: 这是一个中文网站**;
@@ -25,11 +28,12 @@ NEWS_36KR_REQ = """从36kr创投平台https://pitchhub.36kr.com/financing-flash
 3. 反思*快讯的html内容示例*中的规律, 设计正则匹配表达式来获取*`快讯`*的标题、链接、时间;
 4. 筛选最近3天的初创企业融资*`快讯`*, 以list[dict]形式打印前5个。
 5. 将全部结果存在本地csv中
+**Notice: view the page element before writing scraping code**
 """


 async def main():
-    di = DataInterpreter(tools=["scrape_web_playwright"])
+    di = DataInterpreter(tools=["Browser"])

    await di.run(ECOMMERCE_REQ)