Merge branch 'dev_tool_selection' of https://gitlab.deepwisdomai.com/agents/data_agents_opt into dev_tool_selection

update: 简化prompt,加入历史代码片段和执行结果
This commit is contained in:
stellahsr 2023-12-07 19:07:03 +08:00
commit 2d106203ff
9 changed files with 658 additions and 64 deletions

View file

@ -16,10 +16,14 @@ from metagpt.prompts.ml_engineer import (
ML_SPECIFIC_PROMPT,
ML_MODULE_MAP,
TOOL_OUTPUT_DESC,
TOOL_USAGE_PROMPT,
)
from metagpt.schema import Message, Plan
from metagpt.tools.functions import registry
from metagpt.utils.common import create_func_config
from metagpt.prompts.ml_engineer import GEN_DATA_DESC_PROMPT, GENERATE_CODE_PROMPT
from metagpt.utils.common import CodeParser
from metagpt.actions.execute_code import ExecutePyCode
class BaseWriteAnalysisCode(Action):
@ -47,13 +51,13 @@ class BaseWriteAnalysisCode(Action):
# 添加默认的提示词
if (
default_system_msg not in messages[0]["content"]
and messages[0]["role"] != "system"
default_system_msg not in messages[0]["content"]
and messages[0]["role"] != "system"
):
messages.insert(0, {"role": "system", "content": default_system_msg})
elif (
default_system_msg not in messages[0]["content"]
and messages[0]["role"] == "system"
default_system_msg not in messages[0]["content"]
and messages[0]["role"] == "system"
):
messages[0] = {
"role": "system",
@ -62,14 +66,14 @@ class BaseWriteAnalysisCode(Action):
return messages
async def run(
self, context: List[Message], plan: Plan = None, task_guide: str = ""
self, context: List[Message], plan: Plan = None, code_steps: str = ""
) -> str:
"""Run of a code writing action, used in data analysis or modeling
Args:
context (List[Message]): Action output history, source action denoted by Message.cause_by
plan (Plan, optional): Overall plan. Defaults to None.
task_guide (str, optional): suggested step breakdown for the current task. Defaults to "".
code_steps (str, optional): suggested step breakdown for the current task. Defaults to "".
Returns:
str: The code string.
@ -83,12 +87,12 @@ class WriteCodeByGenerate(BaseWriteAnalysisCode):
super().__init__(name, context, llm)
async def run(
self,
context: [List[Message]],
plan: Plan = None,
task_guide: str = "",
system_msg: str = None,
**kwargs,
self,
context: [List[Message]],
plan: Plan = None,
code_steps: str = "",
system_msg: str = None,
**kwargs,
) -> str:
context.append(Message(content=self.REUSE_CODE_INSTRUCTION, role="user"))
prompt = self.process_msg(context, system_msg)
@ -98,6 +102,7 @@ class WriteCodeByGenerate(BaseWriteAnalysisCode):
class WriteCodeWithTools(BaseWriteAnalysisCode):
"""Write code with help of local available tools. Choose tools first, then generate code to use the tools"""
execute_code = ExecutePyCode()
@staticmethod
def _parse_recommend_tools(module: str, recommend_tools: list) -> List[Dict]:
@ -121,10 +126,10 @@ class WriteCodeWithTools(BaseWriteAnalysisCode):
return tool_catalog
async def _tool_recommendation(
self,
context: [List[Message]],
code_steps: str,
available_tools: list
self,
context: [List[Message]],
code_steps: str,
available_tools: list
) -> list:
"""
Recommend tools for the specified task.
@ -148,15 +153,28 @@ class WriteCodeWithTools(BaseWriteAnalysisCode):
recommend_tools = rsp["recommend_tools"]
return recommend_tools
async def run(
self,
context: List[Message],
plan: Plan = None,
task_guide: str = "",
self,
context: List[Message],
plan: Plan = None,
code_steps: str = "",
**kwargs,
) -> str:
task_type = plan.current_task.task_type
logger.info(f"task_type is: {task_type}")
available_tools = registry.get_all_schema_by_module(task_type)
special_prompt = ML_SPECIFIC_PROMPT.get(task_type, "")
# special_prompt = ML_SPECIFIC_PROMPT.get(task_type, "")
finished_tasks = plan.get_finished_tasks()
code_context = [task.code for task in finished_tasks]
code_context = "\n\n".join(code_context)
### add runtime info
result, success = await self.execute_code.run(code_context)
logger.info(result)
if len(available_tools) > 0:
available_tools = [
@ -164,25 +182,46 @@ class WriteCodeWithTools(BaseWriteAnalysisCode):
for tool in available_tools
]
recommend_tools = await self._tool_recommendation(context, task_guide, available_tools)
final_code = code_context
recommend_tools = await self._tool_recommendation(context, code_steps, available_tools)
tool_catalog = self._parse_recommend_tools(task_type, recommend_tools)
logger.info(f"Recommended tools: \n{recommend_tools}")
module_name = ML_MODULE_MAP[task_type]
output_desc = TOOL_OUTPUT_DESC.get(task_type, "")
prompt = TOO_ORGANIZATION_PROMPT.format(
special_prompt=special_prompt,
code_steps=task_guide,
hist_info = f"Previous finished code is \n\n ```Python {final_code} ``` \n\n " \
f"Conde runtime result is {result} \n\n"
prompt = TOOL_USAGE_PROMPT.format(
goal=plan.current_task.instruction,
context=hist_info,
code_steps=code_steps,
module_name=module_name,
output_desc=output_desc,
function_catalog=tool_catalog,
)
context.append(Message(content=prompt, role="user"))
else:
context.append(Message(content=self.REUSE_CODE_INSTRUCTION, role="user"))
context.append(Message(content=special_prompt, role="user"))
prompt = self.process_msg(context)
tool_config = create_func_config(CODE_GENERATOR_WITH_TOOLS)
rsp = await self.llm.aask_code(prompt, **tool_config)
return rsp["code"]
tool_config = create_func_config(CODE_GENERATOR_WITH_TOOLS)
rsp = await self.llm.aask_code(prompt, **tool_config)
logger.info(f"rsp is: {rsp}")
final_code = final_code + "\n\n" + rsp["code"]
return final_code
else:
hist_info = f"Previous finished code is \n\n ```Python {code_context} ``` \n\n " \
f"Conde runtime result is {result} \n\n"
prompt = GENERATE_CODE_PROMPT.format(
goal=plan.current_task.instruction,
context=hist_info,
)
tool_config = create_func_config(CODE_GENERATOR_WITH_TOOLS)
logger.info(f"prompt is: {prompt}")
rsp = await self.llm.aask_code(prompt, **tool_config)
logger.info(f"rsp is: {rsp}")
return rsp["code"]

View file

@ -0,0 +1,80 @@
import json
from typing import Dict, List, Union
from metagpt.actions import Action
from metagpt.schema import Message, Task, Plan
from metagpt.utils.common import CodeParser
# Prompt that asks the LLM for a short, high-level list of solution steps for
# the current task before any code is written.
# The JSON example at the bottom contains literal braces, which is why
# WriteCodeSteps.run fills the single `{context}` placeholder with str.replace
# instead of str.format.
CODE_STEPS_PROMPT_TEMPLATE = """
# Context
{context}
-----
Tasks are all code development tasks.
You are a professional engineer, the main goal is to plan out concise solution steps for Current Task before coding.
A planning process can reduce the difficulty and improve the quality of coding.
You may be given some code plans for the tasks ahead, but you don't have to follow the existing plan when planning the current task.
The output plan should following the subsequent principles:
1.The plan is a rough checklist of steps outlining the entire program's structure.Try to keep the number of steps fewer than 5.
2.The steps should be written concisely and at a high level, avoiding overly detailed implementation specifics.
3.The execution of the plan happens sequentially, but the plan can incorporate conditional (if) and looping(loop) keywords for more complex structures.
Output the code steps in a JSON format, as shown in this example:
```json
{
"Step 1": "",
"Step 2": "",
"Step 3": "",
...
}
```
"""
# Layout of the context section of the prompt. WriteCodeSteps.get_context fills
# it with str.format using the keys `user_requirement`, `tasks` and
# `current_task`.
STRUCTURAL_CONTEXT = """
## User Requirement
{user_requirement}
## Current Plan
{tasks}
## Current Task
{current_task}
"""
class WriteCodeSteps(Action):
    """Action that asks the LLM for a concise step plan for the plan's current task."""

    async def run(self, plan: Plan) -> str:
        """Generate the code steps for the current task of ``plan``.

        Args:
            plan (Plan): Task plan; its goal, task list and current task are
                rendered into the prompt context.

        Returns:
            str: The code steps extracted from the LLM reply by
                ``CodeParser.parse_code``.
        """
        context = self.get_context(plan)
        # The template contains a literal JSON example with braces, so the
        # placeholder is substituted with str.replace rather than str.format.
        code_steps_prompt = CODE_STEPS_PROMPT_TEMPLATE.replace("{context}", context)
        reply = await self._aask(code_steps_prompt)
        return CodeParser.parse_code(block=None, text=reply)

    def get_context(self, plan: Plan) -> str:
        """Render the user requirement, the plan and the current task as prompt text.

        Only the task fields listed in ``select_task_keys`` are kept, which
        leaves out the other task fields (e.g. code and result).

        Args:
            plan (Plan): Task plan to describe.

        Returns:
            str: ``STRUCTURAL_CONTEXT`` filled with the plan information.
        """
        select_task_keys = ("task_id", "instruction", "is_finished", "code_steps")

        def process_task(task: Task) -> Dict:
            task_dict = task.dict()
            return {k: task_dict[k] for k in task_dict if k in select_task_keys}

        tasks = json.dumps(
            [process_task(task) for task in plan.tasks], indent=4, ensure_ascii=False
        )
        # Serialize the current task the same way as the task list
        # (ensure_ascii=False keeps non-ASCII instructions readable), and fall
        # back to an empty JSON object string so the value is always a str.
        current_task = (
            json.dumps(process_task(plan.current_task), ensure_ascii=False)
            if plan.current_task
            else "{}"
        )
        return STRUCTURAL_CONTEXT.format(
            user_requirement=plan.goal, tasks=tasks, current_task=current_task
        )

View file

@ -23,7 +23,6 @@ Output the information in a JSON format, as shown in this example:
```
"""
ASSIGN_TASK_TYPE_PROMPT = """
Please assign a task type to each task in the list below from the given categories:
{task_list}
@ -53,7 +52,6 @@ ASSIGN_TASK_TYPE = {
},
}
TOOL_RECOMMENDATION_PROMPT = """
Your are a tool recommender, the main goal is to recommend suitable tools for current task before coding. A tool means a function that can be used to help you solve the task.
@ -88,7 +86,6 @@ SELECT_FUNCTION_TOOLS = {
},
}
CODE_GENERATOR_WITH_TOOLS = {
"name": "add_subtask_code",
"description": "Add new code cell of current task to the end of an active Jupyter notebook.",
@ -104,6 +101,54 @@ CODE_GENERATOR_WITH_TOOLS = {
},
}
# Prompt used when code is generated with the help of registered tools.
# Placeholders: {goal}, {context}, {output_desc}, {function_catalog},
# {module_name}.
# NOTE(review): WriteCodeWithTools.run also passes `code_steps=...` to
# str.format, but this template has no {code_steps} placeholder, so that value
# never reaches the prompt — confirm whether that is intended.
# NOTE(review): the import example below is hard-coded to the data_preprocess
# module regardless of {module_name}.
TOOL_USAGE_PROMPT = """
## Target
{goal}
## History Info
{context}
## Available Tools:
Each function is described in JSON format, including the function name and parameters. {output_desc}
{function_catalog}
When you call a function above, you should import the function from `{module_name}` first, e.g.:
```python
from metagpt.tools.functions.libs.data_preprocess import fill_missing_value
```end
## Your Output Format:
Generate the complete code for this task:
```python
# Tools used: [function names or 'none']
<your code for the current task, without any comments>
```end
## Attention:
Make sure use the columns from the dataset columns
Finish your coding tasks as a helpful programmer based on the tools.
"""
# Prompt used when no tools are available for the task type.
# Placeholders: {goal} and {context}.
# NOTE(review): the output format and the attention section still mention
# tools although no tool catalog is included in this prompt.
GENERATE_CODE_PROMPT = """
## Target
{goal}
## History Info
{context}
## Your Output Format:
Generate the complete code for this task:
```python
# Tools used: [function names or 'none']
<your code for the current task>
```end
## Attention:
Make sure use the columns from the dataset columns
Finish your coding tasks as a helpful programmer based on the tools.
"""
TOO_ORGANIZATION_PROMPT = """
The previous conversation has provided all tasks step-by-step for the use goal and their statuses.
Now, begin writing code for the current task. This code should writen strictly on the basis of all previous completed tasks code, not a standalone code. And avoid writing duplicate code that has already been written in previous tasks, such as repeated import of packages, reading data, etc.
@ -167,7 +212,6 @@ CLASSIFICATION_MODEL_OUTPUT_DESC = ""
REGRESSION_MODEL_OUTPUT_DESC = ""
ML_SPECIFIC_PROMPT = {
"data_preprocess": DATA_PREPROCESS_PROMPT,
"feature_engineering": FEATURE_ENGINEERING_PROMPT,

View file

@ -10,12 +10,12 @@ from metagpt.actions import Action
from metagpt.actions.execute_code import ExecutePyCode
from metagpt.actions.write_analysis_code import WriteCodeByGenerate, WriteCodeWithTools
from metagpt.actions.write_plan import WritePlan
# from metagpt.actions.write_task_guide import WriteTaskGuide
from metagpt.logs import logger
from metagpt.prompts.ml_engineer import GEN_DATA_DESC_PROMPT
from metagpt.roles import Role
from metagpt.schema import Message, Plan
from metagpt.utils.common import CodeParser
from metagpt.actions.write_code_steps import WriteCodeSteps
STRUCTURAL_CONTEXT = """
## User Requirement
@ -39,7 +39,7 @@ catboost
def truncate(result: str, keep_len: int = 1000) -> str:
desc = "Truncated to show only the last 1000 characters\n"
if result.startswith(desc):
result = result[-len(desc):]
result = result[-len(desc) :]
if len(result) > keep_len:
result = result[-keep_len:]
@ -110,9 +110,9 @@ class AskReview(Action):
logger.info("most recent context:")
latest_action = context[-1].cause_by.__name__ if context[-1].cause_by else ""
prompt = f"\nPlease review output from {latest_action}:\n" \
"If you want to change a task in the plan, say 'change task task_id, ... (things to change)'\n" \
"If you confirm the output and wish to continue with the current process, type CONFIRM\n" \
"If you want to terminate the process, type exit:\n"
"If you want to change a task in the plan, say 'change task task_id, ... (things to change)'\n" \
"If you confirm the output and wish to continue with the current process, type CONFIRM\n" \
"If you want to terminate the process, type exit:\n"
rsp = input(prompt)
if rsp.lower() in ("exit"):
@ -123,11 +123,6 @@ class AskReview(Action):
return rsp, confirmed
# class WriteTaskGuide(Action):
# async def run(self, task_instruction: str, data_desc: dict = None) -> str:
# return ""
class GenerateDataDesc(Action):
async def run(self, files: list) -> dict:
data_desc = {}
@ -148,13 +143,13 @@ class GenerateDataDesc(Action):
class MLEngineer(Role):
def __init__(
self, name="ABC", profile="MLEngineer", goal="", auto_run: bool = False, data_path: str = None
self, name="ABC", profile="MLEngineer", goal="", auto_run: bool = False, data_path: str = None
):
super().__init__(name=name, profile=profile, goal=goal)
self._set_react_mode(react_mode="plan_and_act")
self.plan = Plan(goal=goal)
self.use_tools = True
self.use_task_guide = True
self.use_code_steps = True
self.execute_code = ExecutePyCode()
self.auto_run = auto_run
self.data_path = data_path
@ -164,6 +159,7 @@ class MLEngineer(Role):
if self.data_path:
self.data_desc = await self._generate_data_desc()
# create initial plan and update until confirmation
await self._update_plan()
@ -172,7 +168,7 @@ class MLEngineer(Role):
logger.info(f"ready to take on task {task}")
# take on current task
code, result, success = await self._write_and_exec_code()
code, result, success, code_steps = await self._write_and_exec_code()
# ask for acceptance, users can other refuse and change tasks in the plan
task_result_confirmed = await self._ask_review()
@ -181,6 +177,7 @@ class MLEngineer(Role):
# tick off this task and record progress
task.code = code
task.result = result
task.code_steps = code_steps
self.plan.finish_current_task()
self.working_memory.clear()
@ -194,9 +191,9 @@ class MLEngineer(Role):
return data_desc
async def _write_and_exec_code(self, max_retry: int = 3):
task_guide = (
await WriteTaskGuide().run(self.plan)
if self.use_task_guide
code_steps = (
await WriteCodeSteps().run(self.plan)
if self.use_code_steps
else ""
)
@ -204,23 +201,22 @@ class MLEngineer(Role):
success = False
while not success and counter < max_retry:
context = self.get_useful_memories()
# print("*" * 10)
# print(context)
# print("*" * 10)
# breakpoint()
column_names_dict = {key: value["column_info"] for key,value in self.data_desc.items()}
if not self.use_tools or self.plan.current_task.task_type == "other":
logger.info("Write code with pure generation")
# code = "print('abc')"
code = await WriteCodeByGenerate().run(
context=context, plan=self.plan, task_guide=task_guide, temperature=0.0
context=context, plan=self.plan, code_steps=code_steps, temperature=0.0
)
cause_by = WriteCodeByGenerate
else:
logger.info("Write code with tools")
code = await WriteCodeWithTools().run(
context=context, plan=self.plan, task_guide=task_guide
context=context, plan=self.plan, code_steps=code_steps, **{"column_names": column_names_dict}
)
cause_by = WriteCodeWithTools
@ -243,7 +239,7 @@ class MLEngineer(Role):
counter += 1
return code, result, success
return code, result, success, code_steps
async def _ask_review(self):
if not self.auto_run:
@ -272,7 +268,7 @@ class MLEngineer(Role):
def get_useful_memories(self) -> List[Message]:
"""find useful memories only to reduce context length and improve performance"""
# TODO dataset description , code steps
user_requirement = self.plan.goal
tasks = json.dumps(
[task.dict() for task in self.plan.tasks], indent=4, ensure_ascii=False
@ -294,7 +290,7 @@ class MLEngineer(Role):
if __name__ == "__main__":
# requirement = "Run data analysis on sklearn Iris dataset, include a plot.."
# requirement = "Run data analysis on sklearn Iris dataset, include a plot"
# requirement = "Run data analysis on sklearn Diabetes dataset, include a plot"
# requirement = "Run data analysis on sklearn Wine recognition dataset, include a plot, and train a model to predict wine class (20% as validation), and show validation accuracy"
# requirement = "Run data analysis on sklearn Wisconsin Breast Cancer dataset, include a plot, train a model to predict targets (20% as validation), and show validation accuracy"
@ -305,10 +301,8 @@ if __name__ == "__main__":
requirement = "Perform data analysis on the provided data. Train a model to predict the target variable Survived. Include data preprocessing, feature engineering, and modeling in your pipeline. The metric is accuracy."
data_path = f"{DATA_PATH}/titanic"
async def main(requirement: str = requirement, auto_run: bool = True, data_path: str = data_path):
role = MLEngineer(goal=requirement, auto_run=auto_run, data_path=data_path)
await role.run(requirement)
fire.Fire(main)

View file

@ -81,6 +81,7 @@ class Task(BaseModel):
code: str = ""
result: str = ""
is_finished: bool = False
code_steps: str = ""
class Plan(BaseModel):

View file

@ -0,0 +1,123 @@
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OrdinalEncoder
from metagpt.tools.functions import registry
from metagpt.tools.functions.schemas.data_preprocess import *
@registry.register("data_preprocess", FillMissingValue)
def fill_missing_value(df: pd.DataFrame, features: list, strategy: str = 'mean', fill_value=None,):
    """Impute missing values in ``features`` with a ``SimpleImputer``.

    The imputed columns are written back into ``df`` and that same frame is
    returned.
    """
    imputer = SimpleImputer(strategy=strategy, fill_value=fill_value)
    imputed = imputer.fit_transform(df[features])
    df[features] = imputed
    return df
# @registry.register("data_preprocess", FillMissingValue)
# def label_encode(df: pd.DataFrame, features: list,):
# for col in features:
# df[col] = LabelEncoder().fit_transform(df[col])
# return df
@registry.register("data_preprocess", SplitBins)
def split_bins(df: pd.DataFrame, features: list, strategy: str = 'quantile',):
    """Replace ``features`` with ordinal bin identifiers from a ``KBinsDiscretizer``.

    The binned columns are written back into ``df`` and that same frame is
    returned.
    """
    discretizer = KBinsDiscretizer(strategy=strategy, encode='ordinal')
    binned = discretizer.fit_transform(df[features])
    df[features] = binned
    return df
@registry.register("data_preprocess", MinMaxScale)
def min_max_scale(df: pd.DataFrame, features: list, ):
    """Scale ``features`` with a default ``MinMaxScaler``.

    The scaled columns are written back into ``df`` and that same frame is
    returned.
    """
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(df[features])
    df[features] = scaled
    return df
@registry.register("data_preprocess", StandardScale)
def standard_scale(df: pd.DataFrame, features: list, ):
    """Standardize ``features`` with a default ``StandardScaler``.

    The scaled columns are written back into ``df`` and that same frame is
    returned.
    """
    scaler = StandardScaler()
    scaled = scaler.fit_transform(df[features])
    df[features] = scaled
    return df
@registry.register("data_preprocess", LogTransform)
def log_transform(df: pd.DataFrame, features: list, ):
    """Apply a natural-log transform to each column in ``features``.

    A column whose minimum is not positive is first shifted so that its
    minimum becomes 2. The transformed columns are written back into ``df``
    and that same frame is returned.
    """
    for col in features:
        col_min = df[col].min()
        if col_min <= 0:
            # Shift into the positive range so the logarithm is defined.
            df[col] = df[col] - col_min + 2
        df[col] = np.log(df[col])
    return df
@registry.register("data_preprocess", MaxAbsScale)
def max_abs_scale(df: pd.DataFrame, features: list, ):
    """Scale ``features`` with a default ``MaxAbsScaler``.

    The scaled columns are written back into ``df`` and that same frame is
    returned.
    """
    scaler = MaxAbsScaler()
    scaled = scaler.fit_transform(df[features])
    df[features] = scaled
    return df
@registry.register("data_preprocess", RobustScale)
def robust_scale(df: pd.DataFrame, features: list, ):
    """Scale ``features`` with a default ``RobustScaler``.

    The scaled columns are written back into ``df`` and that same frame is
    returned.
    """
    scaler = RobustScaler()
    scaled = scaler.fit_transform(df[features])
    df[features] = scaled
    return df
@registry.register("data_preprocess", OrdinalEncode)
def ordinal_encode(df: pd.DataFrame, features: list,):
    """Encode ``features`` with a default ``OrdinalEncoder``.

    The encoded columns are written back into ``df`` and that same frame is
    returned.
    """
    encoder = OrdinalEncoder()
    encoded = encoder.fit_transform(df[features])
    df[features] = encoded
    return df
if __name__ == '__main__':
    def run():
        """Smoke-test the preprocessing tools on a tiny frame and print each result."""
        # np.nan is the canonical spelling; the upper-case alias used before
        # is not guaranteed to exist in newer NumPy releases.
        V = {
            'a': [-1, 2, 3, 6, 5, 4],
            'b': [1.1, 2.2, 3.3, 6.6, 5.5, 4.4],
            'c': ['aa', 'bb', 'cc', 'dd', 'ee', 'ff'],
            'd': [1, None, 3, None, 5, 4],
            'e': [1.1, np.nan, 3.3, None, 5.5, 4.4],
            'f': ['aa', np.nan, 'cc', None, '', 'ff'],
        }

        df = pd.DataFrame(V)
        print(df.dtypes)

        numeric_features = ['a', 'b', 'd', 'e']
        numeric_features_wo_miss = ['a', 'b', ]
        categorial_features = ['c', 'f']

        # Every call gets its own copy because the tools write into the frame
        # they receive.
        df_ = fill_missing_value(df.copy(), numeric_features)
        print(df_)
        df_ = fill_missing_value(df.copy(), categorial_features, strategy='constant', fill_value='hehe')
        print(df_)
        df_ = fill_missing_value(df.copy(), numeric_features, strategy='constant', fill_value=999)
        print(df_)

        df_ = split_bins(df.copy(), numeric_features_wo_miss, strategy='quantile')
        print(df_)

        df_ = min_max_scale(df.copy(), numeric_features, )
        print(df_)

        df_ = standard_scale(df.copy(), numeric_features, )
        print(df_)

        df_ = log_transform(df.copy(), numeric_features, )
        print(df_)

        df_ = max_abs_scale(df.copy(), numeric_features, )
        print(df_)

        df_ = robust_scale(df.copy(), numeric_features, )
        print(df_)

        # ordinal_encode was the only registered tool without a demo call;
        # column 'c' has no missing values.
        df_ = ordinal_encode(df.copy(), ['c'])
        print(df_)

    run()

View file

@ -0,0 +1,196 @@
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from metagpt.tools.functions import registry
from metagpt.tools.functions.schemas.ml_model import *
#########
## 分类 ##
#########
@registry.register("classification_model", LogisticRegressionClassification)
def logistic_regression_classification(df, label, test_size=0.2, penalty='l2', dual=False):
    """Train a logistic regression classifier and return test-split probabilities.

    Args:
        df: Input DataFrame containing the features and the ``label`` column.
        label: Name of the target column.
        test_size: Proportion of the rows held out as the test split.
        penalty: Passed to ``LogisticRegression(penalty=...)``.
        dual: Passed to ``LogisticRegression(dual=...)``.

    Returns:
        dict: ``{'te_pred_prob': ...}`` with the ``predict_proba`` output for
            the test split.
    """
    # Work on a copy: the label encoding below assigns columns in place and
    # would otherwise rewrite the caller's DataFrame.
    df = df.copy()
    for col in [c for c in df if df[c].dtype == 'object']:
        df[col] = LabelEncoder().fit_transform(df[col])
    df = df.fillna(0)

    features = [col for col in df if col != label]
    tr_x, te_x, tr_y, _ = train_test_split(df[features], df[label], test_size=test_size, random_state=1)

    model = LogisticRegression(penalty=penalty, dual=dual)
    model.fit(tr_x, tr_y)
    return {'te_pred_prob': model.predict_proba(te_x)}
@registry.register("classification_model", RandomForestClassification)
def random_forest_classification(df, label, test_size=0.2, n_estimators=100, criterion='gini'):
    """Train a random forest classifier and return test-split probabilities.

    Args:
        df: Input DataFrame containing the features and the ``label`` column.
        label: Name of the target column.
        test_size: Proportion of the rows held out as the test split.
        n_estimators: Passed to ``RandomForestClassifier(n_estimators=...)``.
        criterion: Passed to ``RandomForestClassifier(criterion=...)``.

    Returns:
        dict: ``{'te_pred_prob': ...}`` with the ``predict_proba`` output for
            the test split.
    """
    # Work on a copy: the label encoding below assigns columns in place and
    # would otherwise rewrite the caller's DataFrame.
    df = df.copy()
    for col in [c for c in df if df[c].dtype == 'object']:
        df[col] = LabelEncoder().fit_transform(df[col])
    df = df.fillna(0)

    features = [col for col in df if col != label]
    tr_x, te_x, tr_y, _ = train_test_split(df[features], df[label], test_size=test_size, random_state=1)

    model = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion)
    model.fit(tr_x, tr_y)
    return {'te_pred_prob': model.predict_proba(te_x)}
@registry.register("classification_model", GradientBoostingClassification)
def gradient_boosting_classification(df, label, test_size=0.2, n_estimators=100, learning_rate=0.1):
    """Train a gradient boosting classifier and return test-split probabilities.

    Args:
        df: Input DataFrame containing the features and the ``label`` column.
        label: Name of the target column.
        test_size: Proportion of the rows held out as the test split.
        n_estimators: Passed to ``GradientBoostingClassifier(n_estimators=...)``.
        learning_rate: Passed to ``GradientBoostingClassifier(learning_rate=...)``.

    Returns:
        dict: ``{'te_pred_prob': ...}`` with the ``predict_proba`` output for
            the test split.
    """
    # Work on a copy: the label encoding below assigns columns in place and
    # would otherwise rewrite the caller's DataFrame.
    df = df.copy()
    for col in [c for c in df if df[c].dtype == 'object']:
        df[col] = LabelEncoder().fit_transform(df[col])
    df = df.fillna(0)

    features = [col for col in df if col != label]
    tr_x, te_x, tr_y, _ = train_test_split(df[features], df[label], test_size=test_size, random_state=1)

    model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate)
    model.fit(tr_x, tr_y)
    return {'te_pred_prob': model.predict_proba(te_x)}
#########
## 回归 ##
#########
@registry.register("regression_model", LinearRegressionRegression)
def linear_regression(df, label, test_size=0.2, ):
    """Train an ordinary least squares model and return test-split predictions.

    Args:
        df: Input DataFrame containing the features and the ``label`` column.
        label: Name of the target column.
        test_size: Proportion of the rows held out as the test split.

    Returns:
        dict: ``{'te_pred_prob': ...}``; despite the key name the value is the
            ``predict`` output for the test split.
    """
    # Work on a copy: the label encoding below assigns columns in place and
    # would otherwise rewrite the caller's DataFrame.
    df = df.copy()
    for col in [c for c in df if df[c].dtype == 'object']:
        df[col] = LabelEncoder().fit_transform(df[col])
    df = df.fillna(0)

    features = [col for col in df if col != label]
    tr_x, te_x, tr_y, _ = train_test_split(df[features], df[label], test_size=test_size, random_state=1)

    model = LinearRegression()
    model.fit(tr_x, tr_y)
    return {'te_pred_prob': model.predict(te_x)}
@registry.register("regression_model", RandomForestRegression)
def random_forest_regression(df, label, test_size=0.2, n_estimators=100, criterion='squared_error'):
    """Train a random forest regressor and return test-split predictions.

    Args:
        df: Input DataFrame containing the features and the ``label`` column.
        label: Name of the target column.
        test_size: Proportion of the rows held out as the test split.
        n_estimators: Passed to ``RandomForestRegressor(n_estimators=...)``.
        criterion: Passed to ``RandomForestRegressor(criterion=...)``.

    Returns:
        dict: ``{'te_pred_prob': ...}``; despite the key name the value is the
            ``predict`` output for the test split.
    """
    # Work on a copy: the label encoding below assigns columns in place and
    # would otherwise rewrite the caller's DataFrame.
    df = df.copy()
    for col in [c for c in df if df[c].dtype == 'object']:
        df[col] = LabelEncoder().fit_transform(df[col])
    df = df.fillna(0)

    features = [col for col in df if col != label]
    tr_x, te_x, tr_y, _ = train_test_split(df[features], df[label], test_size=test_size, random_state=1)

    model = RandomForestRegressor(n_estimators=n_estimators, criterion=criterion)
    model.fit(tr_x, tr_y)
    return {'te_pred_prob': model.predict(te_x)}
@registry.register("regression_model", GradientBoostingRegression)
def gradient_boosting_regression(df, label, test_size=0.2, n_estimators=100, learning_rate=0.1):
    """Train a gradient boosting regressor and return test-split predictions.

    Args:
        df: Input DataFrame containing the features and the ``label`` column.
        label: Name of the target column.
        test_size: Proportion of the rows held out as the test split.
        n_estimators: Passed to ``GradientBoostingRegressor(n_estimators=...)``.
        learning_rate: Passed to ``GradientBoostingRegressor(learning_rate=...)``.

    Returns:
        dict: ``{'te_pred_prob': ...}``; despite the key name the value is the
            ``predict`` output for the test split.
    """
    # Work on a copy: the label encoding below assigns columns in place and
    # would otherwise rewrite the caller's DataFrame.
    df = df.copy()
    for col in [c for c in df if df[c].dtype == 'object']:
        df[col] = LabelEncoder().fit_transform(df[col])
    df = df.fillna(0)

    features = [col for col in df if col != label]
    tr_x, te_x, tr_y, _ = train_test_split(df[features], df[label], test_size=test_size, random_state=1)

    model = GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=learning_rate)
    model.fit(tr_x, tr_y)
    return {'te_pred_prob': model.predict(te_x)}
if __name__ == '__main__':
    def run():
        """Smoke-test every model tool on small sklearn datasets and print the predictions."""
        import pandas as pd
        from sklearn.datasets import load_iris, make_regression

        separator = '####' * 5

        # Classification: iris with one string feature, one int feature and a
        # string target.
        iris = load_iris(as_frame=True)
        df = iris['data']
        df['target'] = iris['target']
        df[df.columns[0]] = df[df.columns[0]].astype(str)
        df[df.columns[1]] = df[df.columns[1]].astype(int)
        df['target'] = df['target'].astype(str)
        print(df)

        classifiers = (
            (logistic_regression_classification, {'penalty': 'l2', 'dual': False}),
            (random_forest_classification, {'n_estimators': 100, 'criterion': 'gini'}),
            (gradient_boosting_classification, {'n_estimators': 100, 'learning_rate': 0.1}),
        )
        for train, params in classifiers:
            print(separator)
            res = train(df, 'target', test_size=0.25, **params)
            print(res['te_pred_prob'])

        # Regression: synthetic data with one string feature and one int feature.
        samples = make_regression()
        df = pd.DataFrame(samples[0])
        df['target'] = samples[1]
        df[df.columns[0]] = df[df.columns[0]].astype(str)
        df[df.columns[1]] = df[df.columns[1]].astype(int)
        print(df)

        regressors = (
            (linear_regression, {}),
            (random_forest_regression, {'n_estimators': 100, 'criterion': 'squared_error'}),
            (gradient_boosting_regression, {'n_estimators': 100, 'learning_rate': 0.1}),
        )
        for train, params in regressors:
            print(separator)
            res = train(df, 'target', test_size=0.25, **params)
            print(res['te_pred_prob'])

    run()

View file

@ -0,0 +1,62 @@
import pandas as pd
from metagpt.tools.functions.schemas.base import tool_field, ToolSchema
# Parameter schema for a missing-value imputation tool.
# NOTE(review): `fill_value` is annotated as int with a default of None —
# confirm whether the annotation should also admit None and non-integer
# fill values.
class FillMissingValue(ToolSchema):
    """Completing missing values with simple strategies"""
    df: pd.DataFrame = tool_field(description="input dataframe")
    features: list = tool_field(description="columns to be processed")
    strategy: str = tool_field(
        description="the imputation strategy",
        default="mean",
    )
    fill_value: int = tool_field(
        description="fill_value is used to replace all occurrences of missing_values",
        default=None,
    )
# class LabelEncode(ToolSchema):
# """Completing missing values with simple strategies"""
# df: pd.DataFrame = tool_field(description="input dataframe")
# features: list = tool_field(description="columns to be processed")
# Parameter schema for a binning tool: a frame, the columns to bin and the
# binning strategy.
class SplitBins(ToolSchema):
    """Bin continuous data into intervals and return the bin identifier encoded as an integer value"""
    df: pd.DataFrame = tool_field(description="input dataframe")
    features: list = tool_field(description="columns to be processed")
    strategy: str = tool_field(
        description="Strategy used to define the widths of the bins",
        default="quantile",
    )
# Parameter schema for a min-max scaling tool: a frame and the columns to scale.
# The class docstring is the tool's description text; its "witch" typo is
# corrected to "which".
class MinMaxScale(ToolSchema):
    """Transform features by scaling each feature to a range, which is (0, 1)"""
    df: pd.DataFrame = tool_field(description="input dataframe")
    features: list = tool_field(description="columns to be processed")
# Parameter schema for a standardization tool: a frame and the columns to scale.
class StandardScale(ToolSchema):
    """Standardize features by removing the mean and scaling to unit variance"""
    df: pd.DataFrame = tool_field(
        description="input dataframe",
    )
    features: list = tool_field(
        description="columns to be processed",
    )
# Parameter schema for a log-transform tool: a frame and the columns to transform.
class LogTransform(ToolSchema):
    """Performs a logarithmic transformation on the specified columns"""
    df: pd.DataFrame = tool_field(
        description="input dataframe",
    )
    features: list = tool_field(
        description="columns to be processed",
    )
# Parameter schema for a max-abs scaling tool: a frame and the columns to scale.
class MaxAbsScale(ToolSchema):
    """Scale each feature by its maximum absolute value"""
    df: pd.DataFrame = tool_field(
        description="input dataframe",
    )
    features: list = tool_field(
        description="columns to be processed",
    )
# Parameter schema for a robust scaling tool: a frame and the columns to scale.
class RobustScale(ToolSchema):
    """Scale features using statistics that are robust to outliers, the quantile_range is (25.0, 75.0)"""
    df: pd.DataFrame = tool_field(
        description="input dataframe",
    )
    features: list = tool_field(
        description="columns to be processed",
    )
# Parameter schema for an ordinal encoding tool: a frame and the columns to encode.
class OrdinalEncode(ToolSchema):
    """Encode categorical features as an integer array"""
    df: pd.DataFrame = tool_field(
        description="input dataframe",
    )
    features: list = tool_field(
        description="columns to be processed",
    )

View file

@ -0,0 +1,55 @@
import pandas as pd
from metagpt.tools.functions.schemas.base import tool_field, ToolSchema
# Parameter schema for a logistic regression classification tool.
class LogisticRegressionClassification(ToolSchema):
    """Logistic Regression (aka logit, MaxEnt) classifier"""
    df: pd.DataFrame = tool_field(description="input dataframe")
    label: str = tool_field(description="target name")
    test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)
    penalty: str = tool_field(description="Specify the norm of the penalty", default="l2")
    # `dual` is a boolean flag; the previous default "l2" was copied from
    # `penalty` and did not match the declared bool type.
    dual: bool = tool_field(description="Dual (constrained) or primal (regularized) formulation", default=False)
# Parameter schema for a random forest classification tool.
class RandomForestClassification(ToolSchema):
    """random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting"""
    df: pd.DataFrame = tool_field(description="input dataframe")
    label: str = tool_field(description="target name")
    test_size: float = tool_field(
        description="The proportion of the test set to all the data",
        default=0.2,
    )
    n_estimators: int = tool_field(
        description="The number of trees in the forest",
        default=100,
    )
    criterion: str = tool_field(
        description="The function to measure the quality of a split",
        default="gini",
    )
# Parameter schema for a gradient boosting classification tool.
class GradientBoostingClassification(ToolSchema):
    """Gradient Boosting for classification.This algorithm builds an additive model in a forward stage-wise fashion"""
    df: pd.DataFrame = tool_field(description="input dataframe")
    label: str = tool_field(description="target name")
    test_size: float = tool_field(
        description="The proportion of the test set to all the data",
        default=0.2,
    )
    n_estimators: int = tool_field(
        description="The number of boosting stages to perform",
        default=100,
    )
    learning_rate: float = tool_field(
        description="Learning rate shrinks the contribution of each tree by learning_rate",
        default=0.1,
    )
# Parameter schema for a linear regression tool.
class LinearRegressionRegression(ToolSchema):
    """Ordinary least squares Linear Regression."""
    df: pd.DataFrame = tool_field(description="input dataframe")
    label: str = tool_field(description="target name")
    test_size: float = tool_field(
        description="The proportion of the test set to all the data",
        default=0.2,
    )
# Parameter schema for a random forest regression tool.
class RandomForestRegression(ToolSchema):
    """random forest is a meta estimator that fits a number of decision tree on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting"""
    df: pd.DataFrame = tool_field(description="input dataframe")
    label: str = tool_field(description="target name")
    test_size: float = tool_field(
        description="The proportion of the test set to all the data",
        default=0.2,
    )
    n_estimators: int = tool_field(
        description="The number of trees in the forest",
        default=100,
    )
    criterion: str = tool_field(
        description="The function to measure the quality of a split",
        default="squared_error",
    )
# Parameter schema for a gradient boosting regression tool.
class GradientBoostingRegression(ToolSchema):
    """Gradient Boosting for regression.This estimator builds an additive model in a forward stage-wise fashion"""
    df: pd.DataFrame = tool_field(description="input dataframe")
    label: str = tool_field(description="target name")
    test_size: float = tool_field(
        description="The proportion of the test set to all the data",
        default=0.2,
    )
    n_estimators: int = tool_field(
        description="The number of boosting stages to perform",
        default=100,
    )
    learning_rate: float = tool_field(
        description="Learning rate shrinks the contribution of each tree by learning_rate",
        default=0.1,
    )