Merge branch 'dev_ldy' into 'dev'

Dev ldy

See merge request agents/data_agents_opt!13
This commit is contained in:
林义章 2023-12-01 07:57:54 +00:00
commit 20a918bf39
5 changed files with 165 additions and 21 deletions

View file

@ -4,10 +4,10 @@
@Author : orange-crow
@File : write_code_v2.py
"""
import json
from typing import Dict, List, Union
from typing import Dict, List, Union, Tuple
from metagpt.actions import Action
from metagpt.logs import logger
from metagpt.prompts.ml_engineer import (
TOOL_RECOMMENDATION_PROMPT,
SELECT_FUNCTION_TOOLS,
@ -99,24 +99,31 @@ class WriteCodeWithTools(BaseWriteAnalysisCode):
"""Write code with help of local available tools. Choose tools first, then generate code to use the tools"""
@staticmethod
def _parse_recommend_tools(module: str, recommend_tools: list) -> str:
def _parse_recommend_tools(module: str, recommend_tools: list) -> Tuple[Dict, List[Dict]]:
"""
Converts recommended tools to a JSON string and checks tool availability in the registry.
Parses and validates a list of recommended tools, and retrieves their schema from registry.
Args:
module (str): The module name for querying tools in the registry.
recommend_tools (list): A list of lists of recommended tools for each step.
Returns:
str: A JSON string with available tools and their schemas for each step.
Tuple[Dict, List[Dict]]:
- valid_tools: A dict of lists of valid tools for each step.
- tool_catalog: A list of dicts of unique tool schemas.
"""
valid_tools = {}
available_tools = registry.get_all_by_module(module).keys()
for index, tools in enumerate(recommend_tools):
key = f"Step {index + 1}"
tools = [tool for tool in tools if tool in available_tools]
valid_tools[key] = registry.get_schemas(module, tools)
return json.dumps(valid_tools)
valid_tools[key] = tools
unique_tools = set()
for tools in valid_tools.values():
unique_tools.update(tools)
tool_catalog = registry.get_schemas(module, unique_tools)
return valid_tools, tool_catalog
async def _tool_recommendation(
self, task: str, data_desc: str, code_steps: str, available_tools: list
@ -165,7 +172,8 @@ class WriteCodeWithTools(BaseWriteAnalysisCode):
recommend_tools = await self._tool_recommendation(
task, task_guide, available_tools
)
recommend_tools = self._parse_recommend_tools(task_type, recommend_tools)
recommend_tools, tool_catalog = self._parse_recommend_tools(task_type, recommend_tools)
logger.info(f"Recommended tools for every steps: {recommend_tools}")
special_prompt = ML_SPECIFIC_PROMPT.get(task_type, "")
module_name = ML_MODULE_MAP[task_type]
@ -190,6 +198,7 @@ class WriteCodeWithTools(BaseWriteAnalysisCode):
module_name=module_name,
output_desc=output_desc,
available_tools=recommend_tools,
tool_catalog=tool_catalog,
)
tool_config = create_func_config(CODE_GENERATOR_WITH_TOOLS)
rsp = await self.llm.aask_code(prompt, **tool_config)

View file

@ -4,12 +4,14 @@
@Author : orange-crow
@File : plan.py
"""
from typing import List
from typing import List, Dict
import json
from metagpt.actions import Action
from metagpt.prompts.ml_engineer import ASSIGN_TASK_TYPE_PROMPT, ASSIGN_TASK_TYPE
from metagpt.schema import Message, Task
from metagpt.utils.common import CodeParser
from metagpt.utils.common import CodeParser, create_func_config
class WritePlan(Action):
PROMPT_TEMPLATE = """
@ -30,7 +32,30 @@ class WritePlan(Action):
]
```
"""
async def run(self, context: List[Message], max_tasks: int = 5) -> str:
async def assign_task_type(self, tasks: List[Dict]) -> str:
    """Assign a task type to each task in *tasks* via an LLM function call.

    Args:
        tasks (List[Dict]): Tasks to classify; each dict must provide
            "task_id" and "instruction" keys.

    Returns:
        str: JSON-encoded list of the input tasks, each augmented with a
            "task_type" key. (Note: returns a JSON string, not List[Dict].)
    """
    # Render one numbered prompt line per task, in plan order.
    task_list = "\n".join(
        [f"Task {task['task_id']}: {task['instruction']}" for task in tasks]
    )
    prompt = ASSIGN_TASK_TYPE_PROMPT.format(task_list=task_list)
    # Constrain the LLM reply to the ASSIGN_TASK_TYPE function schema.
    tool_config = create_func_config(ASSIGN_TASK_TYPE)
    rsp = await self.llm.aask_code(prompt, **tool_config)
    task_type_list = rsp["task_type"]
    # NOTE(review): zip() silently truncates if the LLM returns fewer
    # types than tasks — trailing tasks keep no "task_type" key.
    for task, task_type in zip(tasks, task_type_list):
        task["task_type"] = task_type
    return json.dumps(tasks)
async def run(
self, context: List[Message], max_tasks: int = 5, use_tools: bool = False
) -> str:
prompt = (
self.PROMPT_TEMPLATE.replace("__context__", "\n".join([str(ct) for ct in context]))
# .replace("__current_plan__", current_plan)
@ -38,6 +63,8 @@ class WritePlan(Action):
)
rsp = await self._aask(prompt)
rsp = CodeParser.parse_code(block=None, text=rsp)
if use_tools:
rsp = await self.assign_task_type(json.loads(rsp))
return rsp
@staticmethod

View file

@ -4,6 +4,35 @@
# @Author : lidanyang
# @File : ml_engineer
# @Desc :
# Prompt asking the LLM to classify each planned task into one of the known
# task-type categories; {task_list} is filled in by WritePlan.assign_task_type.
ASSIGN_TASK_TYPE_PROMPT = """
## All Task Type:
- **data_preprocess**: Only involve cleaning and preparing data through techniques like imputation, scaling, and encoding, not containing reading data, feature engineering, model training, etc.
- **feature_engineering**: Involves enhancing data features through techniques like encoding, aggregation, time component analysis, and creating polynomial and interaction features, etc.
- **other**: Any tasks that do not fit into the previous categories, such as visualization, summarizing findings, build model, etc.
Please assign a task type to each task in the list below from the given categories:
{task_list}
"""

# OpenAI function-calling schema paired with ASSIGN_TASK_TYPE_PROMPT: the model
# must answer with {"task_type": [<one type string per task, in order>]}.
ASSIGN_TASK_TYPE = {
    "name": "assign_task_type",
    "description": "assign task type to each task by order",
    "parameters": {
        "type": "object",
        "properties": {
            "task_type": {
                "type": "array",
                "description": "List of task type.",
                "items": {
                    "type": "string",
                },
            },
        },
        "required": ["task_type"],
    },
}
TOOL_RECOMMENDATION_PROMPT = """
## Comprehensive Task Description:
{task}
@ -95,9 +124,13 @@ from metagpt.tools.functions.libs.feature_engineering import fill_missing_value
```
## Available Functions for Each Step:
Each function is described in JSON format, including the function name and parameters. {output_desc}
Here's a list of all available functions for each step. You can find more details about each function in [## Function Catalog]
{available_tools}
## Function Catalog:
Each function is described in JSON format, including the function name and parameters. {output_desc}
{function_catalog}
## Your Output Format:
Generate the complete code for every step, listing any used function tools at the beginning of the step:
```python
@ -133,11 +166,12 @@ When performing feature engineering, please adhere to the following principles:
- Importantly, provide detailed comments explaining the purpose of each feature and how it might enhance model performance, especially when the features are generated based on semantic understanding without clear user directives.
"""
CLASSIFICATION_MODEL_PROMPT = """
MODEL_TRAIN_PROMPT = """
When selecting and training a model, please follow these guidelines to ensure optimal performance:
- Keep in mind that your user prioritizes results and is highly focused on model performance. So, when needed, feel free to use models of any complexity to improve effectiveness, such as lightGBM, XGBoost, CatBoost, etc.
If user specifies a model, use that model. Otherwise, use the model you believe will best solve the problem.
"""
REGRESSION_MODEL_PROMPT = """
"""
DATA_PREPROCESS_OUTPUT_DESC = "Please note that all functions uniformly output a processed pandas.DataFrame, facilitating seamless integration into the broader workflow."
@ -151,8 +185,8 @@ REGRESSION_MODEL_OUTPUT_DESC = ""
ML_SPECIFIC_PROMPT = {
"data_preprocess": DATA_PREPROCESS_PROMPT,
"feature_engineering": FEATURE_ENGINEERING_PROMPT,
"classification_model": CLASSIFICATION_MODEL_PROMPT,
"regression_model": REGRESSION_MODEL_PROMPT,
"classification_model": MODEL_TRAIN_PROMPT,
"regression_model": MODEL_TRAIN_PROMPT,
}
TOOL_OUTPUT_DESC = {

View file

@ -125,7 +125,7 @@ class MLEngineer(Role):
# print("*" * 10)
# breakpoint()
if not self.use_tools or self.plan.current_task.task_type == "":
if not self.use_tools or self.plan.current_task.task_type == "other":
# code = "print('abc')"
code = await WriteCodeByGenerate().run(
context=context, plan=self.plan, task_guide=task_guide, temperature=0.0
@ -171,7 +171,9 @@ class MLEngineer(Role):
plan_confirmed = False
while not plan_confirmed:
context = self.get_useful_memories()
rsp = await WritePlan().run(context, max_tasks=max_tasks)
rsp = await WritePlan().run(
context, max_tasks=max_tasks, use_tools=self.use_tools
)
self.working_memory.add(
Message(content=rsp, role="assistant", cause_by=WritePlan)
)

View file

@ -1,11 +1,12 @@
import asyncio
import pytest
from metagpt.actions.write_analysis_code import WriteCodeByGenerate
from metagpt.actions.write_analysis_code import WriteCodeByGenerate, WriteCodeWithTools
from metagpt.actions.execute_code import ExecutePyCode
from metagpt.schema import Message
from metagpt.schema import Message, Plan, Task
from metagpt.logs import logger
@pytest.mark.asyncio
async def test_write_code_by_list_plan():
write_code = WriteCodeByGenerate()
@ -22,6 +23,77 @@ async def test_write_code_by_list_plan():
print(f"\n[Output]: 任务{task}的执行结果是: \n{output}\n")
messages.append(output[0])
@pytest.mark.asyncio
async def test_tool_recommendation():
    # Integration test (hits the real LLM): given a task, its code steps, and a
    # candidate tool list, only the tools relevant to a step should be recommended.
    task = "对已经读取的数据集进行数据清洗"
    code_steps = """
step 1: 对数据集进行去重
step 2: 对数据集进行缺失值处理
"""
    available_tools = [
        {
            "name": "fill_missing_value",
            "description": "Completing missing values with simple strategies",
        },
        {
            "name": "split_bins",
            "description": "Bin continuous data into intervals and return the bin identifier encoded as an integer value",
        },
    ]
    write_code = WriteCodeWithTools()
    # NOTE(review): _tool_recommendation appears to be declared as
    # (self, task, data_desc, code_steps, available_tools) elsewhere in this MR;
    # confirm this 3-argument call matches the current signature.
    tools = await write_code._tool_recommendation(task, code_steps, available_tools)
    # One recommendation list per step; step 1 (dedup) needs no tool,
    # step 2 (missing values) should pick fill_missing_value.
    assert len(tools) == 2
    assert tools[0] == []
    assert tools[1] == ["fill_missing_value"]
@pytest.mark.asyncio
async def test_write_code_with_tools():
    # Integration test (hits the real LLM): WriteCodeWithTools.run should
    # generate non-empty code for the current data_preprocess task of a plan
    # whose first task is already finished.
    write_code = WriteCodeWithTools()
    messages = []
    task_map = {
        "1": Task(
            task_id="1",
            instruction="随机生成一个pandas DataFrame数据集",
            task_type="unknown",
            dependent_task_ids=[],
            # NOTE(review): this snippet uses np.nan but only imports pandas —
            # it likely needs "import numpy as np" if it is ever executed.
            code="""
import pandas as pd
df = pd.DataFrame({
'a': [1, 2, 3, 4, 5],
'b': [1.1, 2.2, 3.3, 4.4, np.nan],
'c': ['aa', 'bb', 'cc', 'dd', 'ee'],
'd': [1, 2, 3, 4, 5]
})
""",
            is_finished=True,
        ),
        "2": Task(
            task_id="2",
            instruction="对数据集进行数据清洗",
            task_type="data_preprocess",
            dependent_task_ids=["1"],
        ),
    }
    plan = Plan(
        goal="构造数据集并进行数据清洗",
        tasks=list(task_map.values()),
        task_map=task_map,
        current_task_id="2",
    )
    task_guide = """
step 1: 对数据集进行去重
step 2: 对数据集进行缺失值处理
"""
    data_desc = "None"
    code = await write_code.run(messages, plan, task_guide, data_desc)
    # The LLM response is non-deterministic; only assert that some code came back.
    assert len(code) > 0
    print(code)
@pytest.mark.asyncio
async def test_write_code_to_correct_error():