add experience examples for scraping task

2026-06-11 15:15:18 +02:00 · 2024-07-09 17:01:07 +08:00 · 2024-07-09 17:01:07 +08:00 · 32fc276245
commit 32fc276245
parent 0b7d7bdf55
2 changed files with 133 additions and 1 deletions
--- a/metagpt/roles/di/data_analyst.py
+++ b/metagpt/roles/di/data_analyst.py
@ -5,8 +5,11 @@ from pydantic import Field, model_validator
 from metagpt.actions.di.execute_nb_code import ExecuteNbCode
 from metagpt.actions.di.write_analysis_code import WriteAnalysisCode
 from metagpt.logs import logger
+from metagpt.prompts.di.data_analyst import BROWSER_INSTRUCTION
+from metagpt.prompts.di.role_zero import ROLE_INSTRUCTION
 from metagpt.roles.di.role_zero import RoleZero
 from metagpt.schema import TaskResult, Message
+from metagpt.strategy.experience_retriever import ExpRetriever, WebExpRetriever
 from metagpt.tools.tool_recommend import BM25ToolRecommender, ToolRecommender
 from metagpt.tools.tool_registry import register_tool

@ -16,10 +19,12 @@ class DataAnalyst(RoleZero):
    name: str = "David"
    profile: str = "DataAnalyst"
    goal: str = "Take on any data-related tasks, such as data analysis, machine learning, deep learning, web browsing, web scraping, web searching, web deployment, terminal operation, git and github operation, etc."
+    instruction: str = ROLE_INSTRUCTION + BROWSER_INSTRUCTION

-    tools: list[str] = ["Plan", "DataAnalyst", "RoleZero"]
+    tools: list[str] = ["Plan", "DataAnalyst", "RoleZero", "Browser"]
    custom_tools: list[str] = ["machine learning", "web scraping", "Terminal"]
    custom_tool_recommender: ToolRecommender = None
+    experience_retriever: ExpRetriever = WebExpRetriever()

    use_reflection: bool = True
    write_code: WriteAnalysisCode = Field(default_factory=WriteAnalysisCode, exclude=True)
@ -63,6 +68,7 @@ class DataAnalyst(RoleZero):
                tool_info=tool_info,
                working_memory=self.rc.working_memory.get() if use_reflection else None,
                use_reflection=use_reflection,
+                browser_memory=self.browser_memory
            )
            self.rc.working_memory.add(Message(content=code, role="assistant", cause_by=WriteAnalysisCode))

--- a/metagpt/strategy/experience_retriever.py
+++ b/metagpt/strategy/experience_retriever.py
@ -448,3 +448,129 @@ Explanation: to review the code, call ReviewAndRewriteCode.run.
 ]
 ```
 """
+
+
+# Few-shot experience for web-scraping requirements. The example first plans
+# the browser steps (navigate, search), then the DataAnalyst steps (inspect the
+# HTML, parse it), and shows the command issued at each subsequent action.
+# Every ```json block must be closed so the example stays well-formed Markdown.
+WEB_SCRAPING_EXAMPLE = """
+## action 1
+User Requirement: Scrap and list the restaurant names of first page by searching for the keyword `beef` on the website https://www.yelp.com/.
+Explanation: The requirement is to scrape data from a website and extract information about restaurants. The process involves searching for restaurants with a specific keyword, retrieving and presenting the data in a structured format.
+
+```json
+[
+    {
+        "command_name": "Plan.append_task",
+        "args": {
+            "task_id": "1",
+            "dependent_task_ids": [],
+            "instruction": "Navigate to the yelp website.",
+            "assignee": "Browser"
+        }
+    },
+    {
+        "command_name": "Plan.append_task",
+        "args": {
+            "task_id": "2",
+            "dependent_task_ids": ["1"],
+            "instruction": "Search for restaurants with the keyword 'beef'.",
+            "assignee": "Browser"
+        }
+    },
+    {
+        "command_name": "Plan.append_task",
+        "args": {
+            "task_id": "3",
+            "dependent_task_ids": ["2"],
+            "instruction": "View the html content of the search result page before scrap data to understand the structure.",
+            "assignee": "DataAnalyst"
+        }
+    },
+    {
+        "command_name": "Plan.append_task",
+        "args": {
+            "task_id": "4",
+            "dependent_task_ids": ["3"],
+            "instruction": "Parse the html content to scrape the restaurant names and print it.",
+            "assignee": "DataAnalyst"
+        }
+    }
+]
+```
+
+## action 2
+Explanation: To search for restaurants, I will now go to the website https://www.yelp.com/ first.
+Here is the command to navigate to the website:
+
+```json
+[
+    {
+        "command_name": "Browser.goto",
+        "args": {
+            "url": "https://www.yelp.com/"
+        }
+    }
+]
+```
+
+## action 3
+Explanation: Since the Browser has successfully navigated to the website, and I find that the element id of the search box is 53. I will finish the current task and then use the Browser tool to type the keyword `beef` in the search box and press enter.
+Here is the command to finish the current task and type the keyword:
+
+```json
+[
+    {
+        "command_name": "Plan.finish_current_task",
+        "args": {}
+    },
+    {
+        "command_name": "Browser.type",
+        "args": {
+            "element_id": 53,
+            "content": "beef",
+            "press_enter_after": true
+        }
+    }
+]
+```
+
+## action 4
+Explanation: Since the Browser has successfully search the keyword `beef`, I will finish the current task and then write code to view the html content of the page.
+Here is the command to finish the current task and view the html content:
+
+```json
+[
+    {
+        "command_name": "Plan.finish_current_task",
+        "args": {}
+    },
+    {
+        "command_name": "DataAnalyst.write_and_exec_code",
+        "args": {}
+    }
+]
+```
+
+## action 5
+Explanation: Since the DataAnalyst has successfully viewed the html content of the page, I will finish the current task and then write code to parse the html content and extract the restaurant names.
+Here is the command to finish the current task and parse the html content:
+
+```json
+[
+    {
+        "command_name": "Plan.finish_current_task",
+        "args": {}
+    },
+    {
+        "command_name": "DataAnalyst.write_and_exec_code",
+        "args": {}
+    }
+]
+```
+
+...
+"""
+
+
+class WebExpRetriever(ExpRetriever):
+    """Experience retriever that serves one hand-written web-scraping example.
+
+    ``retrieve`` does not inspect ``context``; it always returns
+    ``WEB_SCRAPING_EXAMPLE`` unchanged.
+    """
+
+    def retrieve(self, context: str = "") -> str:
+        # The lookup is static, so ``context`` has no effect on the result.
+        web_example = WEB_SCRAPING_EXAMPLE
+        return web_example