Merge branch 'data_analyst_ldy' into 'mgx_ops'

Data analyst ldy See merge request pub/MetaGPT!189
2026-05-03 04:42:38 +02:00 · 2024-07-15 06:55:52 +00:00 · 2024-07-15 06:55:52 +00:00 · e2ce006d15
commit e2ce006d15
parent 123850777f b214e49733
17 changed files with 376 additions and 198 deletions
--- a/metagpt/strategy/experience_retriever.py
+++ b/metagpt/strategy/experience_retriever.py
@ -629,6 +629,8 @@ class KeywordExpRetriever(ExpRetriever):
                return DEPLOY_EXAMPLE
            elif "issue" in context.lower():
                return FIX_ISSUE_EXAMPLE
+            elif "https:" in context.lower() or "http:" in context.lower():  # each scheme needs its own `in` test; a bare "https:" literal is always truthy
+                return WEB_SCRAPING_EXAMPLE
        elif exp_type == "task":
            if "diagnose" in context.lower():
                return SEARCH_SYMBOL_EXAMPLE
@ -890,3 +892,122 @@ Explanation: to review the code, call ReviewAndRewriteCode.run.
 ]
 ```
 """
+
+
+WEB_SCRAPING_EXAMPLE = """
+## action 1
+User Requirement: Scrap and list the restaurant names of first page by searching for the keyword `beef` on the website https://www.yelp.com/.
+Explanation: The requirement is to scrape data from a website and extract information about restaurants. The process involves searching for restaurants with a specific keyword, retrieving and presenting the data in a structured format.
+
+```json
+[
+    {
+        "command_name": "Plan.append_task",
+        "args": {
+            "task_id": "1",
+            "dependent_task_ids": [],
+            "instruction": "Navigate to the yelp website.",
+            "assignee": "Browser"
+        }
+    },
+    {
+        "command_name": "Plan.append_task",
+        "args": {
+            "task_id": "2",
+            "dependent_task_ids": ["1"],
+            "instruction": "Search for restaurants with the keyword 'beef'.",
+            "assignee": "Browser"
+        }
+    },
+    {
+        "command_name": "Plan.append_task",
+        "args": {
+            "task_id": "3",
+            "dependent_task_ids": ["2"],
+            "instruction": "View the html content of the search result page before scrap data to understand the structure.",
+            "assignee": "DataAnalyst"
+        }
+    },
+    {
+        "command_name": "Plan.append_task",
+        "args": {
+            "task_id": "4",
+            "dependent_task_ids": ["3"],
+            "instruction": "Parse the html content to scrape the restaurant names and print it.",
+            "assignee": "DataAnalyst"
+        }
+    }
+]
+```
+
+## action 2
+Explanation: To search for restaurants, I will now go to the website https://www.yelp.com/ first.
+Here is the command to navigate to the website:
+
+```json
+[
+    {
+        "command_name": "Browser.goto",
+        "args": {
+            "url": "https://www.yelp.com/"
+        }
+    }
+]
+```
+
+## action 3
+Explanation: Since the Browser has successfully navigated to the website, and I find that the element id of the search box is 53. I will finish the current task and then use the Browser tool to type the keyword `beef` in the search box and press enter.
+Here is the command to finish the current task and type the keyword:
+
+```json
+[
+    {
+        "command_name": "Plan.finish_current_task",
+        "args": {}
+    },
+    {
+        "command_name": "Browser.type",
+        "args": {
+            "element_id": 53,
+            "content": "beef",
+            "press_enter_after": true
+        }
+    }
+]
+```
+
+## action 4
+Explanation: Since the Browser has successfully search the keyword `beef`, I will finish the current task and then write code to view the html content of the page.
+Here is the command to finish the current task and view the html content:
+
+```json
+[
+    {
+        "command_name": "Plan.finish_current_task",
+        "args": {}
+    },
+    {
+        "command_name": "DataAnalyst.write_and_exec_code",
+        "args": {}
+    }
+]
+```
+
+## action 5
+Explanation: Since the DataAnalyst has successfully viewed the html content of the page, I will finish the current task and then write code to parse the html content and extract the restaurant names.
+Here is the command to finish the current task and parse the html content:
+
+```json
+[
+    {
+        "command_name": "Plan.finish_current_task",
+        "args": {}
+    },
+    {
+        "command_name": "DataAnalyst.write_and_exec_code",
+        "args": {}
+    }
+]
+```
+...
+"""
--- a/metagpt/strategy/planner.py
+++ b/metagpt/strategy/planner.py
@ -40,8 +40,14 @@ PLAN_STATUS = """
 ## Current Task
 {current_task}

+## Finished Section of Current Task
+### code
+{current_task_code}
+### execution result
+{current_task_result}
+
 ## Task Guidance
-Write complete code for 'Current Task'. And avoid duplicating code from 'Finished Tasks', such as repeated import of packages, reading data, etc.
+Write code for the incomplete sections of 'Current Task'. And avoid duplicating code from 'Finished Tasks', such as repeated import of packages, reading data, etc.
 Specifically, {guidance}
 """

@ -173,6 +179,8 @@ class Planner(BaseModel):
            code_written=code_written,
            task_results=task_results,
            current_task=self.current_task.instruction,
+            current_task_code=self.current_task.code if self.current_task.code else "",
+            current_task_result=self.current_task.result if self.current_task.result else "",
            guidance=guidance,
        )

--- a/metagpt/strategy/task_type.py
+++ b/metagpt/strategy/task_type.py
@ -8,7 +8,7 @@ from metagpt.prompts.task_type import (
    FEATURE_ENGINEERING_PROMPT,
    IMAGE2WEBPAGE_PROMPT,
    MODEL_EVALUATE_PROMPT,
-    MODEL_TRAIN_PROMPT,
+    MODEL_TRAIN_PROMPT, WEB_SCRAPING_PROMPT,
 )


@ -62,6 +62,7 @@ class TaskType(Enum):
    WEBSCRAPING = TaskTypeDef(
        name="web scraping",
        desc="For scraping data from web pages.",
+        guidance=WEB_SCRAPING_PROMPT,
    )
    EMAIL_LOGIN = TaskTypeDef(
        name="email login",