Merge branch 'data_analyst_ldy' into 'mgx_ops'

Data analyst ldy

See merge request pub/MetaGPT!189
This commit is contained in:
林义章 2024-07-15 06:55:52 +00:00
commit e2ce006d15
17 changed files with 376 additions and 198 deletions

View file

@ -30,6 +30,12 @@ from metagpt.logs import logger
from metagpt.utils.report import NotebookReporter
# Max number of trailing characters of package-install output to keep when truncating.
INSTALL_KEEPLEN = 500
# Bootstrap snippet executed once per notebook session to silence warnings and
# non-error log records, keeping cell outputs clean for downstream parsing.
INI_CODE = """import warnings
import logging
root_logger = logging.getLogger()
root_logger.setLevel(logging.ERROR)
warnings.filterwarnings('ignore')"""
class RealtimeOutputNotebookClient(NotebookClient):
@ -79,6 +85,12 @@ class ExecuteNbCode(Action):
)
self.reporter = NotebookReporter()
self.set_nb_client()
self.init_called = False
async def init_code(self):
    """Run the warning/log-suppression bootstrap (INI_CODE) in the notebook, at most once.

    The init_called flag makes repeated calls no-ops, so callers may invoke
    this before every execution without re-running the bootstrap.
    """
    if not self.init_called:
        await self.run(INI_CODE)
        self.init_called = True
def set_nb_client(self):
self.nb_client = RealtimeOutputNotebookClient(
@ -175,9 +187,12 @@ class ExecuteNbCode(Action):
is_success = False
output_text = remove_escape_and_color_codes(output_text)
if is_success:
output_text = remove_log_and_warning_lines(output_text)
# The useful information of the exception is at the end,
# the useful information of normal output is at the begining.
output_text = output_text[:keep_len] if is_success else output_text[-keep_len:]
if '<!DOCTYPE html>' not in output_text:
output_text = output_text[:keep_len] if is_success else output_text[-keep_len:]
parsed_output.append(output_text)
return is_success, ",".join(parsed_output)
@ -268,6 +283,18 @@ class ExecuteNbCode(Action):
return outputs, success
def remove_log_and_warning_lines(input_str: str) -> str:
    """Strip log/warning noise lines from *input_str*.

    A line is dropped when its lowercased form contains any of the known
    noise markers; the surviving lines are re-joined and the result is
    stripped of leading/trailing whitespace.
    """
    noise_markers = ("[warning]", "warning:", "[cv]", "[info]")
    kept_lines = []
    for line in input_str.split("\n"):
        lowered = line.lower()
        if any(marker in lowered for marker in noise_markers):
            continue
        kept_lines.append(line)
    return "\n".join(kept_lines).strip()
def remove_escape_and_color_codes(input_str: str):
# 使用正则表达式去除jupyter notebook输出结果中的转义字符和颜色代码
# Use regular expressions to get rid of escape characters and color codes in jupyter notebook output.

View file

@ -57,7 +57,7 @@ class WriteAnalysisCode(Action):
code = await self._debug_with_reflection(context=context, working_memory=working_memory)
else:
rsp = await self.llm.aask(context, system_msgs=[INTERPRETER_SYSTEM_MSG], **kwargs)
code = CodeParser.parse_code(text=rsp)
code = CodeParser.parse_code(text=rsp, lang="python")
return code

View file

@ -1,44 +1,26 @@
CMD_PROMPT = """
# Data Structure
class Task(BaseModel):
task_id: str = ""
dependent_task_ids: list[str] = []
instruction: str = ""
task_type: str = ""
assignee: str = "David"
from metagpt.strategy.task_type import TaskType
# Available Commands
{available_commands}
# Current Plan
{plan_status}
# Example
{example}
# Instructions
Based on the context, write a plan or modify an existing plan to achieve the goal. A plan consists of one to 3 tasks.
If plan is created, you should track the progress and update the plan accordingly, such as finish_current_task, append_task, reset_task, replace_task, etc.
Pay close attention to new user message, review the conversation history, use reply_to_human to respond to new user requirement.
Note:
1. If you keeping encountering errors, unexpected situation, or you are not sure of proceeding, use ask_human to ask for help.
2. Carefully review your progress at the current task, if your actions so far has not fulfilled the task instruction, you should continue with current task. Otherwise, finish current task.
3. Each time you finish a task, use reply_to_human to report your progress.
Pay close attention to the Example provided, you can reuse the example for your current situation if it fits.
You may use any of the available commands to create a plan or update the plan. You may output mutiple commands, they will be executed sequentially.
If you finish current task, you will automatically take the next task in the existing plan, use finish_task, DON'T append a new task.
# Your commands in a json array, in the following output format, always output a json array, if there is nothing to do, use the pass command:
Some text indicating your thoughts, such as how you should update the plan status, respond to inquiry, or seek for help. Then a json array of commands.
```json
[
{{
"command_name": str,
"args": {{"arg_name": arg_value, ...}}
}},
...
]
```
Notice: your output JSON data section must start with **```json [**
BROWSER_INSTRUCTION = """
4. Carefully choose to use or not use the browser tool to assist you in web tasks.
- When no click action is required, no need to use the browser tool to navigate to the webpage before scraping.
- If you need detail HTML content, write code to get it but not to use the browser tool.
- Make sure the command_name are certainly in Available Commands when you use the browser tool.
"""
TASK_TYPE_DESC = "\n".join([f"- **{tt.type_name}**: {tt.value.desc}" for tt in TaskType])
CODE_STATUS = """
**Code written**:
{code}
**Execution status**: {status}
**Execution result**: {result}
"""
BROWSER_INFO = """
Here are ordered web actions in the browser environment, note that you can not use the browser tool in the current environment.
{browser_actions}
The latest url is the one you should use to view the page. If view page has been done, directly use the variable and html content in executing result.
"""

View file

@ -5,7 +5,7 @@ When presented a current task, tackle the task using the available commands.
Pay close attention to new user message, review the conversation history, use RoleZero.reply_to_human to respond to new user requirement.
Note:
1. If you keeping encountering errors, unexpected situation, or you are not sure of proceeding, use RoleZero.ask_human to ask for help.
2. Carefully review your progress at the current task, if your actions so far has not fulfilled the task instruction, you should continue with current task. Otherwise, finish current task.
2. Carefully review your progress at the current task, if your actions so far has not fulfilled the task instruction, you should continue with current task. Otherwise, finish current task by Plan.finish_current_task explicitly.
3. Each time you finish a task, use RoleZero.reply_to_human to report your progress.
"""
@ -18,6 +18,9 @@ class Task(BaseModel):
task_type: str = ""
assignee: str = ""
# Available Task Types
{task_type_desc}
# Available Commands
{available_commands}
Special Command: Use {{"command_name": "end"}} to do nothing or indicate completion of all requirements and the end of actions.
@ -38,7 +41,7 @@ Pay close attention to the Example provided, you can reuse the example for your
You may use any of the available commands to create a plan or update the plan. You may output mutiple commands, they will be executed sequentially.
If you finish current task, you will automatically take the next task in the existing plan, use Plan.finish_task, DON'T append a new task.
# Your commands in a json array, in the following output format. If there is nothing to do, use the pass or end command:
# Your commands in a json array, in the following output format with correct command_name and args. If there is nothing to do, use the pass or end command:
Some text indicating your thoughts, such as how you should update the plan status, respond to inquiry, or seek for help. Then a json array of commands. You must output ONE and ONLY ONE json array. DON'T output multiple json arrays with thoughts between them.
```json
[

View file

@ -28,7 +28,10 @@ your code
```
"""
REFLECTION_SYSTEM_MSG = """You are an AI Python assistant. You will be given your previous implementation code of a task, runtime error results, and a hint to change the implementation appropriately. Write your full implementation."""
REFLECTION_SYSTEM_MSG = """
You are an AI Python assistant. You will be given your previous implementation code of a task, runtime error results, and a hint to change the implementation appropriately. Write your full implementation.
When occuring ModuleNotFoundError, always install the required package. And use Terminal tool if available.
"""
DEBUG_REFLECTION_EXAMPLE = '''
[previous impl]:

View file

@ -53,3 +53,9 @@ The current task is about converting image into webpage code. please note the fo
- Single-Step Code Generation: Execute the entire code generation process in a single step, encompassing HTML, CSS, and JavaScript. Avoid fragmenting the code generation into multiple separate steps to maintain consistency and simplify the development workflow.
- Save webpages: Be sure to use the save method provided.
"""
# Prompt for taking on "web_scraping" tasks
WEB_SCRAPING_PROMPT = """
- Remember to view and print the necessary HTML content in a separate task to understand the structure first before scraping data.
- Since the data required by user may not correspond directly to the actual HTML element names, you should thoroughly analyze the HTML structure and meanings of all elements in the executing result first. Ensure the `class_` in your code should derived from the actual HTML structure directly, not based on your knowledge. To ensure it, analyse the most suitable location of the 'class_' in the actual HTML content before code.
"""

View file

@ -1,151 +1,109 @@
from __future__ import annotations
import json
from typing import Literal
import re
from typing import List
from pydantic import model_validator
from pydantic import Field, model_validator
from metagpt.actions import Action
from metagpt.actions.di.execute_nb_code import ExecuteNbCode
from metagpt.actions.di.write_analysis_code import WriteAnalysisCode
from metagpt.logs import logger
from metagpt.prompts.di.data_analyst import CMD_PROMPT
from metagpt.prompts.di.role_zero import JSON_REPAIR_PROMPT
from metagpt.roles.di.data_interpreter import DataInterpreter
from metagpt.schema import Message, TaskResult
from metagpt.strategy.experience_retriever import KeywordExpRetriever
from metagpt.strategy.planner import Planner
from metagpt.strategy.thinking_command import (
Command,
prepare_command_prompt,
run_commands,
)
from metagpt.tools.tool_recommend import BM25ToolRecommender
from metagpt.utils.common import CodeParser
from metagpt.utils.report import ThoughtReporter
from metagpt.utils.repair_llm_raw_output import repair_llm_raw_output, RepairType
from metagpt.prompts.di.data_analyst import BROWSER_INSTRUCTION, TASK_TYPE_DESC, CODE_STATUS, BROWSER_INFO
from metagpt.prompts.di.role_zero import ROLE_INSTRUCTION
from metagpt.roles.di.role_zero import RoleZero
from metagpt.schema import TaskResult, Message
from metagpt.strategy.experience_retriever import ExpRetriever, KeywordExpRetriever
from metagpt.tools.tool_recommend import BM25ToolRecommender, ToolRecommender
from metagpt.tools.tool_registry import register_tool
class DataAnalyst(DataInterpreter):
@register_tool(include_functions=["write_and_exec_code"])
class DataAnalyst(RoleZero):
name: str = "David"
profile: str = "DataAnalyst"
goal: str = "Take on any data-related tasks, such as data analysis, machine learning, deep learning, web browsing, web scraping, web searching, web deployment, terminal operation, git and github operation, etc."
react_mode: Literal["react"] = "react"
max_react_loop: int = 20 # used for react mode
task_result: TaskResult = None
available_commands: list[Command] = [
Command.APPEND_TASK,
Command.RESET_TASK,
Command.REPLACE_TASK,
Command.FINISH_CURRENT_TASK,
# Command.PUBLISH_MESSAGE,
Command.ASK_HUMAN,
Command.REPLY_TO_HUMAN,
# Command.PASS,
]
commands: list[dict] = [] # issued commands to be executed
user_requirement: str = ""
instruction: str = ROLE_INSTRUCTION + BROWSER_INSTRUCTION
task_type_desc: str = TASK_TYPE_DESC
tools: list[str] = ["Plan", "DataAnalyst", "RoleZero", "Browser"]
custom_tools: list[str] = ["machine learning", "web scraping", "Terminal"]
custom_tool_recommender: ToolRecommender = None
experience_retriever: ExpRetriever = KeywordExpRetriever()
use_reflection: bool = True
write_code: WriteAnalysisCode = Field(default_factory=WriteAnalysisCode, exclude=True)
execute_code: ExecuteNbCode = Field(default_factory=ExecuteNbCode, exclude=True)
@model_validator(mode="after")
def set_plan_and_tool(self) -> "DataInterpreter":
# We force using this parameter for DataAnalyst
assert self.react_mode == "react"
assert self.auto_run
assert self.use_plan
def set_custom_tool(self):
    """Lazily build a BM25 recommender over the custom tools, unless one was injected."""
    if self.custom_tools and not self.custom_tool_recommender:
        self.custom_tool_recommender = BM25ToolRecommender(tools=self.custom_tools)
# Roughly the same part as DataInterpreter.set_plan_and_tool
self._set_react_mode(react_mode=self.react_mode, max_react_loop=self.max_react_loop, auto_run=self.auto_run)
if self.tools and not self.tool_recommender:
self.tool_recommender = BM25ToolRecommender(tools=self.tools)
self.set_actions([WriteAnalysisCode])
def _update_tool_execution(self):
    # Register this role's code-execution command so the RoleZero command
    # dispatcher can route "DataAnalyst.write_and_exec_code" to the bound method.
    self.tool_execution_map.update({
        "DataAnalyst.write_and_exec_code": self.write_and_exec_code,
    })
# HACK: Init Planner, control it through dynamic thinking; Consider formalizing as a react mode
self.planner = Planner(goal="", working_memory=self.rc.working_memory, auto_run=True)
async def parse_browser_actions(self, memory: List[Message]) -> List[Message]:
    """Summarize Browser activity found in *memory* into working memory.

    Runs the base-class parsing first, then pairs every browser-caused
    message with the "Command Browser.<cmd> executed" line of the message
    immediately before it, and records the ordered (command, url) history
    via the BROWSER_INFO template. Returns the (possibly augmented) memory.
    """
    memory = await super().parse_browser_actions(memory)
    browser_actions = []
    for index, msg in enumerate(memory):
        if msg.cause_by == "browser":
            # The page-view message is expected to carry "URL: <url>\n";
            # the command that produced it is assumed to be the preceding
            # message — NOTE(review): both .group(1) calls raise if the
            # expected pattern is absent; confirm upstream always emits it.
            browser_url = re.search('URL: (.*?)\\n', msg.content).group(1)
            pattern = re.compile(r"Command Browser\.(\w+) executed")
            browser_actions.append({
                'command': pattern.match(memory[index - 1].content).group(1),
                'current url': browser_url
            })
    if browser_actions:
        browser_actions = BROWSER_INFO.format(browser_actions=browser_actions)
        self.rc.working_memory.add(Message(content=browser_actions, role="user", cause_by="browser"))
    return memory
return self
async def write_and_exec_code(self):
"""Write a code block for current task and execute it in an interactive notebook environment."""
counter = 0
success = False
await self.execute_code.init_code()
async def _think(self) -> bool:
"""Useful in 'react' mode. Use LLM to decide whether and what to do next."""
self._set_state(0)
example = ""
if not self.planner.plan.goal:
self.user_requirement = self.get_memories()[-1].content
self.planner.plan.goal = self.user_requirement
example = KeywordExpRetriever().retrieve(self.user_requirement)
# plan info
plan_status = self.planner.get_plan_status()
plan_status = self.planner.plan.model_dump(include=["goal", "tasks"])
# for task in plan_status["tasks"]:
# task.pop("code")
# task.pop("result")
prompt = CMD_PROMPT.format(
plan_status=plan_status,
example=example,
available_commands=prepare_command_prompt(self.available_commands),
)
context = self.llm.format_msg(self.working_memory.get() + [Message(content=prompt, role="user")])
# print(*context, sep="\n" + "*" * 5 + "\n")
async with ThoughtReporter(enable_llm_stream=True):
rsp = await self.llm.aask(context)
# tool info
if self.custom_tool_recommender:
plan = self.planner.plan
fixed = ["Terminal"] if "Terminal" in self.custom_tools else None
tool_info = await self.custom_tool_recommender.get_recommended_tool_info(fixed=fixed, plan=plan)
else:
tool_info = ""
# 临时方案待role zero的版本完成可将本注释内的代码直接替换掉
# -------------开始---------------
try:
commands = CodeParser.parse_code(block=None, lang="json", text=rsp)
commands = json.loads(repair_llm_raw_output(output=commands, req_keys=[None], repair_type=RepairType.JSON))
except json.JSONDecodeError as e:
commands = await self.llm.aask(msg=JSON_REPAIR_PROMPT.format(json_data=rsp))
commands = json.loads(CodeParser.parse_code(block=None, lang="json", text=commands))
except Exception as e:
tb = traceback.format_exc()
print(tb)
while not success and counter < 3:
### write code ###
logger.info(f"ready to WriteAnalysisCode")
use_reflection = (counter > 0 and self.use_reflection) # only use reflection after the first trial
# 为了对LLM不按格式生成进行容错
if isinstance(commands, dict):
commands = commands["commands"] if "commands" in commands else [commands]
# -------------结束---------------
code = await self.write_code.run(
user_requirement=self.planner.plan.goal,
plan_status=plan_status,
tool_info=tool_info,
working_memory=self.rc.working_memory.get(),
use_reflection=use_reflection,
)
self.rc.working_memory.add(Message(content=code, role="assistant", cause_by=WriteAnalysisCode))
self.rc.working_memory.add(Message(content=rsp, role="assistant"))
await run_commands(self, commands, self.rc.working_memory)
return bool(self.rc.todo)
### execute code ###
result, success = await self.execute_code.run(code)
print(result)
async def _act(self) -> Message:
"""Useful in 'react' mode. Return a Message conforming to Role._act interface."""
logger.info(f"ready to take on task {self.planner.plan.current_task}")
self.rc.working_memory.add(Message(content=result, role="user", cause_by=ExecuteNbCode))
# TODO: Consider an appropriate location to insert task experience formally
experience = KeywordExpRetriever().retrieve(self.planner.plan.current_task.instruction, exp_type="task")
if experience and experience not in [msg.content for msg in self.rc.working_memory.get()]:
exp_msg = Message(content=experience, role="assistant")
self.rc.working_memory.add(exp_msg)
### process execution result ###
counter += 1
if success:
task_result = TaskResult(code=code, result=result, is_success=success)
self.planner.current_task.update_task_result(task_result)
code, result, is_success = await self._write_and_exec_code()
self.planner.plan.current_task.is_success = (
is_success # mark is_success, determine is_finished later in thinking
)
# FIXME: task result is always overwritten by the last act, whereas it can be made of of multiple acts
self.task_result = TaskResult(code=code, result=result, is_success=is_success)
return Message(content="Task completed", role="assistant", sent_from=self._setting, cause_by=WriteAnalysisCode)
async def _react(self) -> Message:
# NOTE: Diff 1: Each time landing here means observing news, set todo to allow news processing in _think
self._set_state(0)
actions_taken = 0
rsp = Message(content="No actions taken yet", cause_by=Action) # will be overwritten after Role _act
while actions_taken < self.rc.max_react_loop:
# NOTE: Diff 2: Keep observing within _react, news will go into memory, allowing adapting to new info
# add news from self._observe, the one called in self.run, consider removing when switching from working_memory to memory
self.working_memory.add_batch(self.rc.news)
await self._observe()
# add news from this self._observe, we need twice because _observe rewrites rc.news
self.working_memory.add_batch(self.rc.news)
# think
has_todo = await self._think()
if not has_todo:
break
# act
logger.debug(f"{self._setting}: {self.rc.state=}, will do {self.rc.todo}")
rsp = await self._act()
actions_taken += 1
return rsp # return output from the last action
status = 'Success' if success else 'Failed'
output = CODE_STATUS.format(code=code, status=status, result=result)
self.rc.working_memory.clear()
return output

View file

@ -41,6 +41,7 @@ class RoleZero(Role):
system_msg: list[str] = None # Use None to conform to the default value at llm.aask
cmd_prompt: str = CMD_PROMPT
instruction: str = ROLE_INSTRUCTION
task_type_desc: str = None
# React Mode
react_mode: Literal["react"] = "react"
@ -148,14 +149,10 @@ class RoleZero(Role):
example=example,
available_commands=tool_info,
instruction=self.instruction.strip(),
task_type_desc=self.task_type_desc,
)
memory = self.rc.memory.get(self.memory_k)
if not self.browser.is_empty_page:
pattern = re.compile(r"Command Browser\.(\w+) executed")
for index, msg in zip(range(len(memory), 0, -1), memory[::-1]):
if pattern.match(msg.content):
memory.insert(index, UserMessage(cause_by="browser", content=await self.browser.view()))
break
memory = await self.parse_browser_actions(memory)
context = self.llm.format_msg(memory + [UserMessage(content=prompt)])
# print(*context, sep="\n" + "*" * 5 + "\n")
async with ThoughtReporter(enable_llm_stream=True) as reporter:
@ -165,6 +162,15 @@ class RoleZero(Role):
return True
async def parse_browser_actions(self, memory: List[Message]) -> List[Message]:
    """Insert the live browser page view just after the latest Browser command.

    When a page is open, scan *memory* from newest to oldest for a
    "Command Browser.<cmd> executed" message and insert a user message with
    the current page content at the position right after it, so the model
    sees the page state in chronological context. Returns the memory list
    (mutated in place when an insert happens).
    """
    if not self.browser.is_empty_page:
        pattern = re.compile(r"Command Browser\.(\w+) executed")
        # zip pairs each message (iterated newest-first) with the index
        # immediately after it in the original ordering — the insert slot.
        for index, msg in zip(range(len(memory), 0, -1), memory[::-1]):
            if pattern.match(msg.content):
                memory.insert(index, UserMessage(cause_by="browser", content=await self.browser.view()))
                break
    return memory
async def _act(self) -> Message:
if self.use_fixed_sop:
return await super()._act()
@ -267,13 +273,14 @@ class RoleZero(Role):
async def _run_commands(self, commands) -> str:
outputs = []
for cmd in commands:
output = f"Command {cmd['command_name']} executed"
# handle special command first
if await self._run_special_command(cmd):
outputs.append(output)
continue
# run command as specified by tool_execute_map
if cmd["command_name"] in self.tool_execution_map:
tool_obj = self.tool_execution_map[cmd["command_name"]]
output = f"Command {cmd['command_name']} executed"
try:
if inspect.iscoroutinefunction(tool_obj):
tool_output = await tool_obj(**cmd["args"])

View file

@ -464,8 +464,8 @@ class Task(BaseModel):
self.is_finished = False
def update_task_result(self, task_result: TaskResult):
self.code = task_result.code
self.result = task_result.result
self.code = self.code + "\n" + task_result.code
self.result = self.result + "\n" + task_result.result
self.is_success = task_result.is_success
@ -669,10 +669,14 @@ class Plan(BaseModel):
"""
return [task for task in self.tasks if task.is_finished]
def append_task(self, task_id: str, dependent_task_ids: list[str], instruction: str, assignee: str):
def append_task(self, task_id: str, dependent_task_ids: list[str], instruction: str, assignee: str, task_type: str):
"""Append a new task with task_id (number) to the end of existing task sequences. If dependent_task_ids is not empty, the task will depend on the tasks with the ids in the list."""
new_task = Task(
task_id=task_id, dependent_task_ids=dependent_task_ids, instruction=instruction, assignee=assignee
task_id=task_id,
dependent_task_ids=dependent_task_ids,
instruction=instruction,
assignee=assignee,
task_type=task_type
)
return self._append_task(new_task)

View file

@ -629,6 +629,8 @@ class KeywordExpRetriever(ExpRetriever):
return DEPLOY_EXAMPLE
elif "issue" in context.lower():
return FIX_ISSUE_EXAMPLE
elif "https:" in context.lower() or "http:" in context.lower():
return WEB_SCRAPING_EXAMPLE
elif exp_type == "task":
if "diagnose" in context.lower():
return SEARCH_SYMBOL_EXAMPLE
@ -890,3 +892,122 @@ Explanation: to review the code, call ReviewAndRewriteCode.run.
]
```
"""
WEB_SCRAPING_EXAMPLE = """
## action 1
User Requirement: Scrap and list the restaurant names of first page by searching for the keyword `beef` on the website https://www.yelp.com/.
Explanation: The requirement is to scrape data from a website and extract information about restaurants. The process involves searching for restaurants with a specific keyword, retrieving and presenting the data in a structured format.
```json
[
{
"command_name": "Plan.append_task",
"args": {
"task_id": "1",
"dependent_task_ids": [],
"instruction": "Navigate to the yelp website.",
"assignee": "Browser"
}
},
{
"command_name": "Plan.append_task",
"args": {
"task_id": "2",
"dependent_task_ids": ["1"],
"instruction": "Search for restaurants with the keyword 'beef'.",
"assignee": "Browser"
}
},
{
"command_name": "Plan.append_task",
"args": {
"task_id": "3",
"dependent_task_ids": ["2"],
"instruction": "View the html content of the search result page before scrap data to understand the structure.",
"assignee": "DataAnalyst"
}
},
{
"command_name": "Plan.append_task",
"args": {
"task_id": "4",
"dependent_task_ids": ["3"],
"instruction": "Parse the html content to scrape the restaurant names and print it.",
"assignee": "DataAnalyst"
}
}
]
```
## action 2
Explanation: To search for restaurants, I will now go to the website https://www.yelp.com/ first.
Here is the command to navigate to the website:
```json
[
{
"command_name": "Browser.goto",
"args": {
"url": "https://www.yelp.com/"
}
}
]
```
## action 3
Explanation: Since the Browser has successfully navigated to the website, and I find that the element id of the search box is 53. I will finish the current task and then use the Browser tool to type the keyword `beef` in the search box and press enter.
Here is the command to finish the current task and type the keyword:
```json
[
{
"command_name": "Plan.finish_current_task",
"args": {}
},
{
"command_name": "Browser.type",
"args": {
"element_id": 53,
"content": "beef",
"press_enter_after": true
}
}
]
```
## action 4
Explanation: Since the Browser has successfully search the keyword `beef`, I will finish the current task and then write code to view the html content of the page.
Here is the command to finish the current task and view the html content:
```json
[
{
"command_name": "Plan.finish_current_task",
"args": {}
},
{
"command_name": "DataAnalyst.write_and_exec_code",
"args": {}
}
]
```
## action 5
Explanation: Since the DataAnalyst has successfully viewed the html content of the page, I will finish the current task and then write code to parse the html content and extract the restaurant names.
Here is the command to finish the current task and parse the html content:
```json
[
{
"command_name": "Plan.finish_current_task",
"args": {}
},
{
"command_name": "DataAnalyst.write_and_exec_code",
"args": {}
}
]
...
"""

View file

@ -40,8 +40,14 @@ PLAN_STATUS = """
## Current Task
{current_task}
## Finished Section of Current Task
### code
{current_task_code}
### execution result
{current_task_result}
## Task Guidance
Write complete code for 'Current Task'. And avoid duplicating code from 'Finished Tasks', such as repeated import of packages, reading data, etc.
Write code for the incomplete sections of 'Current Task'. And avoid duplicating code from 'Finished Tasks', such as repeated import of packages, reading data, etc.
Specifically, {guidance}
"""
@ -173,6 +179,8 @@ class Planner(BaseModel):
code_written=code_written,
task_results=task_results,
current_task=self.current_task.instruction,
current_task_code=self.current_task.code if self.current_task.code else "",
current_task_result=self.current_task.result if self.current_task.result else "",
guidance=guidance,
)

View file

@ -8,7 +8,7 @@ from metagpt.prompts.task_type import (
FEATURE_ENGINEERING_PROMPT,
IMAGE2WEBPAGE_PROMPT,
MODEL_EVALUATE_PROMPT,
MODEL_TRAIN_PROMPT,
MODEL_TRAIN_PROMPT, WEB_SCRAPING_PROMPT,
)
@ -62,6 +62,7 @@ class TaskType(Enum):
WEBSCRAPING = TaskTypeDef(
name="web scraping",
desc="For scraping data from web pages.",
guidance=WEB_SCRAPING_PROMPT,
)
EMAIL_LOGIN = TaskTypeDef(
name="email login",

View file

@ -5,11 +5,11 @@
# @File : __init__.py
# @Desc :
from metagpt.tools.libs import (
# data_preprocess,
# feature_engineering,
data_preprocess,
feature_engineering,
sd_engine,
gpt_v_generator,
# web_scraping,
web_scraping,
# email_login,
terminal,
editor,
@ -20,11 +20,11 @@ from metagpt.tools.libs import (
from metagpt.tools.libs.env import get_env, set_get_env_entry, default_get_env, get_env_description
_ = (
# data_preprocess,
# feature_engineering,
data_preprocess,
feature_engineering,
sd_engine,
gpt_v_generator,
# web_scraping,
web_scraping,
# email_login,
terminal,
editor,

View file

@ -8,13 +8,15 @@ from metagpt.utils.parse_html import simplify_html
@register_tool(tags=["web scraping"])
async def view_page_element_to_scrape(url: str, requirement: str, keep_links: bool = False) -> None:
"""view the HTML content of current page to understand the structure. When executed, the content will be printed out
async def view_page_element_to_scrape(url: str, requirement: str, keep_links: bool = False) -> str:
"""view the HTML content of current page to understand the structure.
Args:
url (str): The URL of the web page to scrape.
requirement (str): Providing a clear and detailed requirement helps in focusing the inspection on the desired elements.
keep_links (bool): Whether to keep the hyperlinks in the HTML content. Set to True if links are required
Returns:
str: The HTML content of the page.
"""
async with Browser() as browser:
await browser.goto(url)
@ -36,7 +38,7 @@ async def view_page_element_to_scrape(url: str, requirement: str, keep_links: bo
html = "\n".join(i.text for i in nodes)
mem_fs.rm_file(filename)
print(html)
return html
# async def get_elements_outerhtml(self, element_ids: list[int]):

View file

@ -104,11 +104,13 @@ class ToolRecommender(BaseModel):
return ranked_tools
async def get_recommended_tool_info(self, **kwargs) -> str:
async def get_recommended_tool_info(self, fixed: list[str] = None, **kwargs) -> str:
"""
Wrap recommended tools with their info in a string, which can be used directly in a prompt.
"""
recommended_tools = await self.recommend_tools(**kwargs)
if fixed:
recommended_tools.extend([self.tools[tool_name] for tool_name in fixed if tool_name in self.tools])
if not recommended_tools:
return ""
tool_schemas = {tool.name: tool.schemas for tool in recommended_tools}

View file

@ -41,7 +41,7 @@ class WebPage(BaseModel):
def get_slim_soup(self, keep_links: bool = False):
soup = _get_soup(self.html)
keep_attrs = ["class"]
keep_attrs = ["class", "id"]
if keep_links:
keep_attrs.append("href")

View file

@ -0,0 +1,54 @@
from metagpt.roles.di.data_analyst import DataAnalyst
HOUSE_PRICE_TRAIN_PATH = '/data/house-prices-advanced-regression-techniques/split_train.csv'
HOUSE_PRICE_EVAL_PATH = '/data/house-prices-advanced-regression-techniques/split_eval.csv'
HOUSE_PRICE_REQ = f"""
This is a house price dataset, your goal is to predict the sale price of a property based on its features. The target column is SalePrice. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report RMSE between the logarithm of the predicted value and the logarithm of the observed sales price on the eval data. Train data path: '{HOUSE_PRICE_TRAIN_PATH}', eval data path: '{HOUSE_PRICE_EVAL_PATH}'.
"""
CALIFORNIA_HOUSING_REQ = """
Analyze the 'Canifornia-housing-dataset' using https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html#sklearn.datasets.fetch_california_housing to predict the median house value. you need to perfrom data preprocessing, feature engineering and finally modeling to predict the target. Use machine learning techniques such as linear regression (including ridge regression and lasso regression), random forest, CatBoost, LightGBM, XGBoost or other appropriate method. You also need to report the MSE on the test dataset
"""
# For web scraping task, please provide url begin with `https://` or `http://`
PAPER_LIST_REQ = """"
Get data from `paperlist` table in https://papercopilot.com/statistics/iclr-statistics/iclr-2024-statistics/,
and save it to a csv file. paper title must include `multiagent` or `large language model`.
**Notice: view the page element before writing scraping code**
"""
ECOMMERCE_REQ = """
Get products data from website https://scrapeme.live/shop/ and save it as a csv file.
The first page product name, price, product URL, and image URL must be saved in the csv.
**Notice: view the page element before writing scraping code**
"""
NEWS_36KR_REQ = """从36kr创投平台https://pitchhub.36kr.com/financing-flash 所有初创企业融资的信息, **注意: 这是一个中文网站**;
下面是一个大致流程, 你会根据每一步的运行结果对当前计划中的任务做出适当调整:
1. 爬取并本地保存html结构;
2. 直接打印第7个*`快讯`*关键词后2000个字符的html内容, 作为*快讯的html内容示例*;
3. 反思*快讯的html内容示例*中的规律, 设计正则匹配表达式来获取*`快讯`*的标题链接时间;
4. 筛选最近3天的初创企业融资*`快讯`*, 以list[dict]形式打印前5个
5. 将全部结果存在本地csv中
**Notice: view the page element before writing scraping code**
"""
WIKIPEDIA_SEARCH_REQ = """
Search for `LLM` on https://www.wikipedia.org/ and print all the meaningful significances of the entry.
"""
STACKOVERFLOW_CLICK_REQ = """
Click the Questions tag in https://stackoverflow.com/ and scrap question name, votes, answers and views num to csv in the first result page.
"""
async def main():
    """Demo driver: start the browser and run DataAnalyst on one sample requirement."""
    di = DataAnalyst()
    # The browser must be started before the role can execute Browser commands.
    await di.browser.start()
    await di.run(STACKOVERFLOW_CLICK_REQ)


if __name__ == "__main__":
    import asyncio

    asyncio.run(main())