mirror of
https://github.com/FoundationAgents/MetaGPT.git
synced 2026-05-05 13:52:38 +02:00
Merge dev to dev_tool_selection
This commit is contained in:
commit
56dd0ee882
8 changed files with 534 additions and 23 deletions
|
|
@ -62,14 +62,14 @@ class BaseWriteAnalysisCode(Action):
|
|||
return messages
|
||||
|
||||
async def run(
|
||||
self, context: List[Message], plan: Plan = None, task_guide: str = ""
|
||||
self, context: List[Message], plan: Plan = None, code_steps: str = ""
|
||||
) -> str:
|
||||
"""Run of a code writing action, used in data analysis or modeling
|
||||
|
||||
Args:
|
||||
context (List[Message]): Action output history, source action denoted by Message.cause_by
|
||||
plan (Plan, optional): Overall plan. Defaults to None.
|
||||
task_guide (str, optional): suggested step breakdown for the current task. Defaults to "".
|
||||
code_steps (str, optional): suggested step breakdown for the current task. Defaults to "".
|
||||
|
||||
Returns:
|
||||
str: The code string.
|
||||
|
|
@ -86,7 +86,7 @@ class WriteCodeByGenerate(BaseWriteAnalysisCode):
|
|||
self,
|
||||
context: [List[Message]],
|
||||
plan: Plan = None,
|
||||
task_guide: str = "",
|
||||
code_steps: str = "",
|
||||
system_msg: str = None,
|
||||
**kwargs,
|
||||
) -> str:
|
||||
|
|
@ -152,7 +152,8 @@ class WriteCodeWithTools(BaseWriteAnalysisCode):
|
|||
self,
|
||||
context: List[Message],
|
||||
plan: Plan = None,
|
||||
task_guide: str = "",
|
||||
code_steps: str = "",
|
||||
data_desc: str = "",
|
||||
) -> str:
|
||||
task_type = plan.current_task.task_type
|
||||
available_tools = registry.get_all_schema_by_module(task_type)
|
||||
|
|
@ -164,7 +165,7 @@ class WriteCodeWithTools(BaseWriteAnalysisCode):
|
|||
for tool in available_tools
|
||||
]
|
||||
|
||||
recommend_tools = await self._tool_recommendation(context, task_guide, available_tools)
|
||||
recommend_tools = await self._tool_recommendation(context, code_steps, available_tools)
|
||||
tool_catalog = self._parse_recommend_tools(task_type, recommend_tools)
|
||||
logger.info(f"Recommended tools: \n{recommend_tools}")
|
||||
|
||||
|
|
@ -172,7 +173,7 @@ class WriteCodeWithTools(BaseWriteAnalysisCode):
|
|||
output_desc = TOOL_OUTPUT_DESC.get(task_type, "")
|
||||
prompt = TOO_ORGANIZATION_PROMPT.format(
|
||||
special_prompt=special_prompt,
|
||||
code_steps=task_guide,
|
||||
code_steps=code_steps,
|
||||
module_name=module_name,
|
||||
output_desc=output_desc,
|
||||
function_catalog=tool_catalog,
|
||||
|
|
|
|||
77
metagpt/actions/write_code_steps.py
Normal file
77
metagpt/actions/write_code_steps.py
Normal file
|
|
@ -0,0 +1,77 @@
|
|||
|
||||
import json
|
||||
from typing import Dict, List, Union
|
||||
|
||||
from metagpt.actions import Action
|
||||
from metagpt.schema import Message, Task, Plan
|
||||
|
||||
|
||||
CODE_STEPS_PROMPT_TEMPLATE = """
|
||||
# Context
|
||||
{context}
|
||||
|
||||
## Format example
|
||||
1.
|
||||
2.
|
||||
3.
|
||||
...
|
||||
|
||||
-----
|
||||
Tasks are all code development tasks.
|
||||
You are a professional engineer, the main goal is to plan out concise solution steps for Current Task before coding.
|
||||
A planning process can reduce the difficulty and improve the quality of coding.
|
||||
You may be given some code plans for the tasks ahead, but you don't have to follow the existing plan when planning the current task.
|
||||
The output plan should following the subsequent principles:
|
||||
1.The plan is a rough checklist of steps outlining the entire program's structure.Try to keep the number of steps fewer than 5.
|
||||
2.The steps should be written concisely and at a high level, avoiding overly detailed implementation specifics.
|
||||
3.The execution of the plan happens sequentially, but the plan can incorporate conditional (if) and looping(loop) keywords for more complex structures.
|
||||
4.Output carefully referenced "Format example" in format.
|
||||
"""
|
||||
|
||||
STRUCTURAL_CONTEXT = """
|
||||
## User Requirement
|
||||
{user_requirement}
|
||||
## Current Plan
|
||||
{tasks}
|
||||
## Current Task
|
||||
{current_task}
|
||||
"""
|
||||
|
||||
|
||||
class WriteCodeSteps(Action):
    """Action that asks the LLM to outline solution steps for the plan's current task."""

    async def run(self, plan: Plan) -> str:
        """Produce a step outline for the current task of ``plan``.

        Args:
            plan (Plan): Overall plan; its goal, task list and current task are
                rendered into the prompt context.

        Returns:
            str: The text returned by ``self._aask`` for the code-steps prompt.
        """
        context = self.get_context(plan)
        code_steps_prompt = CODE_STEPS_PROMPT_TEMPLATE.format(context=context)
        return await self._aask(code_steps_prompt)

    def get_context(self, plan: Plan) -> str:
        """Render the plan into the ``STRUCTURAL_CONTEXT`` template.

        Only a subset of each task's fields is kept so the prompt stays short.

        Args:
            plan (Plan): Plan providing ``goal``, ``tasks`` and ``current_task``.

        Returns:
            str: The formatted context block.
        """
        select_task_keys = ('task_id', 'instruction', 'is_finished', 'code_steps')

        def process_task(task) -> dict:
            # Keep the field order of task.dict(), dropping everything not selected.
            return {k: v for k, v in task.dict().items() if k in select_task_keys}

        tasks = json.dumps(
            [process_task(task) for task in plan.tasks], indent=4, ensure_ascii=False
        )
        # ensure_ascii=False keeps non-ASCII instructions readable, matching how
        # `tasks` is serialised above; the fallback is the JSON text of an empty object.
        current_task = (
            json.dumps(process_task(plan.current_task), ensure_ascii=False)
            if plan.current_task
            else "{}"
        )
        return STRUCTURAL_CONTEXT.format(
            user_requirement=plan.goal, tasks=tasks, current_task=current_task
        )
|
||||
|
||||
|
|
@ -10,12 +10,12 @@ from metagpt.actions import Action
|
|||
from metagpt.actions.execute_code import ExecutePyCode
|
||||
from metagpt.actions.write_analysis_code import WriteCodeByGenerate, WriteCodeWithTools
|
||||
from metagpt.actions.write_plan import WritePlan
|
||||
from metagpt.actions.write_task_guide import WriteTaskGuide
|
||||
from metagpt.logs import logger
|
||||
from metagpt.prompts.ml_engineer import GEN_DATA_DESC_PROMPT
|
||||
from metagpt.roles import Role
|
||||
from metagpt.schema import Message, Plan
|
||||
from metagpt.utils.common import CodeParser
|
||||
from metagpt.actions.write_code_steps import WriteCodeSteps
|
||||
|
||||
STRUCTURAL_CONTEXT = """
|
||||
## User Requirement
|
||||
|
|
@ -123,11 +123,6 @@ class AskReview(Action):
|
|||
return rsp, confirmed
|
||||
|
||||
|
||||
# class WriteTaskGuide(Action):
|
||||
# async def run(self, task_instruction: str, data_desc: dict = None) -> str:
|
||||
# return ""
|
||||
|
||||
|
||||
class GenerateDataDesc(Action):
|
||||
async def run(self, files: list) -> dict:
|
||||
data_desc = {}
|
||||
|
|
@ -154,7 +149,7 @@ class MLEngineer(Role):
|
|||
self._set_react_mode(react_mode="plan_and_act")
|
||||
self.plan = Plan(goal=goal)
|
||||
self.use_tools = True
|
||||
self.use_task_guide = True
|
||||
self.use_code_steps = True
|
||||
self.execute_code = ExecutePyCode()
|
||||
self.auto_run = auto_run
|
||||
self.data_path = data_path
|
||||
|
|
@ -172,7 +167,7 @@ class MLEngineer(Role):
|
|||
logger.info(f"ready to take on task {task}")
|
||||
|
||||
# take on current task
|
||||
code, result, success = await self._write_and_exec_code()
|
||||
code, result, success, code_steps = await self._write_and_exec_code()
|
||||
|
||||
# ask for acceptance, users can other refuse and change tasks in the plan
|
||||
task_result_confirmed = await self._ask_review()
|
||||
|
|
@ -181,6 +176,7 @@ class MLEngineer(Role):
|
|||
# tick off this task and record progress
|
||||
task.code = code
|
||||
task.result = result
|
||||
task.code_steps = code_steps
|
||||
self.plan.finish_current_task()
|
||||
self.working_memory.clear()
|
||||
|
||||
|
|
@ -194,9 +190,9 @@ class MLEngineer(Role):
|
|||
return data_desc
|
||||
|
||||
async def _write_and_exec_code(self, max_retry: int = 3):
|
||||
task_guide = (
|
||||
await WriteTaskGuide().run(self.plan)
|
||||
if self.use_task_guide
|
||||
code_steps = (
|
||||
await WriteCodeSteps().run(self.plan)
|
||||
if self.use_code_steps
|
||||
else ""
|
||||
)
|
||||
|
||||
|
|
@ -214,13 +210,13 @@ class MLEngineer(Role):
|
|||
logger.info("Write code with pure generation")
|
||||
# code = "print('abc')"
|
||||
code = await WriteCodeByGenerate().run(
|
||||
context=context, plan=self.plan, task_guide=task_guide, temperature=0.0
|
||||
context=context, plan=self.plan, code_steps=code_steps, temperature=0.0
|
||||
)
|
||||
cause_by = WriteCodeByGenerate
|
||||
else:
|
||||
logger.info("Write code with tools")
|
||||
code = await WriteCodeWithTools().run(
|
||||
context=context, plan=self.plan, task_guide=task_guide
|
||||
context=context, plan=self.plan, code_steps=code_steps,
|
||||
)
|
||||
cause_by = WriteCodeWithTools
|
||||
|
||||
|
|
@ -243,7 +239,7 @@ class MLEngineer(Role):
|
|||
|
||||
counter += 1
|
||||
|
||||
return code, result, success
|
||||
return code, result, success, code_steps
|
||||
|
||||
async def _ask_review(self):
|
||||
if not self.auto_run:
|
||||
|
|
@ -272,7 +268,7 @@ class MLEngineer(Role):
|
|||
|
||||
def get_useful_memories(self) -> List[Message]:
|
||||
"""find useful memories only to reduce context length and improve performance"""
|
||||
|
||||
# TODO dataset description , code steps
|
||||
user_requirement = self.plan.goal
|
||||
tasks = json.dumps(
|
||||
[task.dict() for task in self.plan.tasks], indent=4, ensure_ascii=False
|
||||
|
|
@ -294,11 +290,11 @@ class MLEngineer(Role):
|
|||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# requirement = "Run data analysis on sklearn Iris dataset, include a plot.."
|
||||
# requirement = "Run data analysis on sklearn Iris dataset, include a plot"
|
||||
# requirement = "Run data analysis on sklearn Diabetes dataset, include a plot"
|
||||
# requirement = "Run data analysis on sklearn Wine recognition dataset, include a plot, and train a model to predict wine class (20% as validation), and show validation accuracy"
|
||||
# requirement = "Run data analysis on sklearn Wisconsin Breast Cancer dataset, include a plot, train a model to predict targets (20% as validation), and show validation accuracy"
|
||||
# requirement = "Run EDA and visualization on this dataset, train a model to predict survival, report metrics on validation set (20%), dataset: workspace/titanic/train.csv"
|
||||
requirement = "Run EDA and visualization on this dataset, train a model to predict survival, report metrics on validation set (20%), dataset: workspace/titanic/train.csv"
|
||||
|
||||
requirement = "Perform data analysis on the provided data. Train a model to predict the target variable Survived. Include data preprocessing, feature engineering, and modeling in your pipeline. The metric is accuracy."
|
||||
data_path = "/data/lidanyang/tabular_data/titanic"
|
||||
|
|
|
|||
|
|
@ -81,6 +81,7 @@ class Task(BaseModel):
|
|||
code: str = ""
|
||||
result: str = ""
|
||||
is_finished: bool = False
|
||||
code_steps: str = ""
|
||||
|
||||
|
||||
class Plan(BaseModel):
|
||||
|
|
|
|||
123
metagpt/tools/functions/libs/data_preprocess.py
Normal file
123
metagpt/tools/functions/libs/data_preprocess.py
Normal file
|
|
@ -0,0 +1,123 @@
|
|||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
from sklearn.preprocessing import KBinsDiscretizer
|
||||
from sklearn.preprocessing import MinMaxScaler
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.preprocessing import MaxAbsScaler
|
||||
from sklearn.preprocessing import RobustScaler
|
||||
from sklearn.preprocessing import OrdinalEncoder
|
||||
|
||||
from metagpt.tools.functions import registry
|
||||
from metagpt.tools.functions.schemas.data_preprocess import *
|
||||
|
||||
|
||||
@registry.register("data_preprocess", FillMissingValue)
def fill_missing_value(df: pd.DataFrame, features: list, strategy: str = 'mean', fill_value=None):
    """Impute missing values in the selected columns with sklearn's SimpleImputer.

    Args:
        df: DataFrame to modify; the selected columns are overwritten in place.
        features: Names of the columns to impute.
        strategy: Forwarded to ``SimpleImputer(strategy=...)``.
        fill_value: Forwarded to ``SimpleImputer(fill_value=...)``.

    Returns:
        The same DataFrame object that was passed in.
    """
    imputer = SimpleImputer(strategy=strategy, fill_value=fill_value)
    imputed = imputer.fit_transform(df[features])
    df[features] = imputed
    return df
|
||||
|
||||
|
||||
# @registry.register("data_preprocess", FillMissingValue)
|
||||
# def label_encode(df: pd.DataFrame, features: list,):
|
||||
# for col in features:
|
||||
# df[col] = LabelEncoder().fit_transform(df[col])
|
||||
# return df
|
||||
|
||||
|
||||
@registry.register("data_preprocess", SplitBins)
def split_bins(df: pd.DataFrame, features: list, strategy: str = 'quantile'):
    """Discretise the selected columns into ordinal bin indices.

    Args:
        df: DataFrame to modify; the selected columns are overwritten in place.
        features: Names of the columns to bin.
        strategy: Forwarded to ``KBinsDiscretizer(strategy=...)``.

    Returns:
        The same DataFrame object that was passed in.
    """
    discretizer = KBinsDiscretizer(strategy=strategy, encode='ordinal')
    binned = discretizer.fit_transform(df[features])
    df[features] = binned
    return df
|
||||
|
||||
|
||||
@registry.register("data_preprocess", MinMaxScale)
def min_max_scale(df: pd.DataFrame, features: list):
    """Rescale the selected columns with a default-configured MinMaxScaler.

    Args:
        df: DataFrame to modify; the selected columns are overwritten in place.
        features: Names of the columns to scale.

    Returns:
        The same DataFrame object that was passed in.
    """
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(df[features])
    df[features] = scaled
    return df
|
||||
|
||||
|
||||
@registry.register("data_preprocess", StandardScale)
def standard_scale(df: pd.DataFrame, features: list):
    """Standardise the selected columns with a default-configured StandardScaler.

    Args:
        df: DataFrame to modify; the selected columns are overwritten in place.
        features: Names of the columns to scale.

    Returns:
        The same DataFrame object that was passed in.
    """
    selected = df[features]
    df[features] = StandardScaler().fit(selected).transform(selected)
    return df
|
||||
|
||||
|
||||
@registry.register("data_preprocess", LogTransform)
def log_transform(df: pd.DataFrame, features: list):
    """Apply a natural-log transform to each selected column, in place.

    A column whose minimum is zero or negative is first shifted so that its
    minimum becomes 2, which keeps every value inside the domain of the log.

    Args:
        df: DataFrame to modify.
        features: Names of the columns to transform.

    Returns:
        The same DataFrame object that was passed in.
    """
    for name in features:
        lowest = df[name].min()
        if lowest <= 0:
            # Shift so the smallest value is exactly 2 before taking the log.
            df[name] = df[name] - lowest + 2
        df[name] = np.log(df[name])
    return df
|
||||
|
||||
|
||||
@registry.register("data_preprocess", MaxAbsScale)
def max_abs_scale(df: pd.DataFrame, features: list):
    """Scale the selected columns with a default-configured MaxAbsScaler.

    Args:
        df: DataFrame to modify; the selected columns are overwritten in place.
        features: Names of the columns to scale.

    Returns:
        The same DataFrame object that was passed in.
    """
    scaler = MaxAbsScaler()
    scaled = scaler.fit_transform(df[features])
    df[features] = scaled
    return df
|
||||
|
||||
|
||||
@registry.register("data_preprocess", RobustScale)
def robust_scale(df: pd.DataFrame, features: list):
    """Scale the selected columns with a default-configured RobustScaler.

    Args:
        df: DataFrame to modify; the selected columns are overwritten in place.
        features: Names of the columns to scale.

    Returns:
        The same DataFrame object that was passed in.
    """
    selected = df[features]
    df[features] = RobustScaler().fit(selected).transform(selected)
    return df
|
||||
|
||||
|
||||
@registry.register("data_preprocess", OrdinalEncode)
def ordinal_encode(df: pd.DataFrame, features: list):
    """Encode the selected columns with a default-configured OrdinalEncoder.

    Args:
        df: DataFrame to modify; the selected columns are overwritten in place.
        features: Names of the columns to encode.

    Returns:
        The same DataFrame object that was passed in.
    """
    encoder = OrdinalEncoder()
    encoded = encoder.fit_transform(df[features])
    df[features] = encoded
    return df
|
||||
|
||||
|
||||
if __name__ == '__main__':
    def run():
        """Exercise the preprocessing tools on a small frame with mixed dtypes and gaps."""
        # np.nan is used instead of the np.NAN alias, which NumPy 2.0 removed.
        V = {
            'a': [-1, 2, 3, 6, 5, 4],
            'b': [1.1, 2.2, 3.3, 6.6, 5.5, 4.4],
            'c': ['aa', 'bb', 'cc', 'dd', 'ee', 'ff'],
            'd': [1, None, 3, None, 5, 4],
            'e': [1.1, np.nan, 3.3, None, 5.5, 4.4],
            'f': ['aa', np.nan, 'cc', None, '', 'ff'],
        }

        df = pd.DataFrame(V)
        print(df.dtypes)

        numeric_features = ['a', 'b', 'd', 'e']
        numeric_features_wo_miss = ['a', 'b']
        categorial_features = ['c', 'f']

        # Every call receives a fresh copy because the tools write into their input.
        df_ = fill_missing_value(df.copy(), numeric_features)
        print(df_)
        df_ = fill_missing_value(df.copy(), categorial_features, strategy='constant', fill_value='hehe')
        print(df_)

        df_ = fill_missing_value(df.copy(), numeric_features, strategy='constant', fill_value=999)
        print(df_)

        # df_ = label_encode(df.copy(), numeric_features + categorial_features, )
        # print(df_)

        df_ = split_bins(df.copy(), numeric_features_wo_miss, strategy='quantile')
        print(df_)

        df_ = min_max_scale(df.copy(), numeric_features)
        print(df_)

        df_ = standard_scale(df.copy(), numeric_features)
        print(df_)

        df_ = log_transform(df.copy(), numeric_features)
        print(df_)

        df_ = max_abs_scale(df.copy(), numeric_features)
        print(df_)

        df_ = robust_scale(df.copy(), numeric_features)
        print(df_)

    run()
|
||||
196
metagpt/tools/functions/libs/ml_model.py
Normal file
196
metagpt/tools/functions/libs/ml_model.py
Normal file
|
|
@ -0,0 +1,196 @@
|
|||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
from sklearn.ensemble import GradientBoostingClassifier
|
||||
|
||||
|
||||
from sklearn.linear_model import LinearRegression
|
||||
from sklearn.ensemble import RandomForestRegressor
|
||||
from sklearn.ensemble import GradientBoostingRegressor
|
||||
|
||||
from metagpt.tools.functions import registry
|
||||
from metagpt.tools.functions.schemas.ml_model import *
|
||||
|
||||
|
||||
#########
|
||||
## Classification ##
|
||||
#########
|
||||
|
||||
|
||||
@registry.register("classification_model", LogisticRegressionClassification)
def logistic_regression_classification(df, label, test_size=0.2, penalty='l2', dual=False):
    """Fit a LogisticRegression on a train split and predict the held-out rows.

    Object-dtype columns (the label included, if it is one) are label-encoded
    and remaining missing values are replaced with 0 before splitting.

    Args:
        df: DataFrame holding the feature columns and the label column.
        label: Name of the target column.
        test_size: Forwarded to ``train_test_split``.
        penalty: Forwarded to ``LogisticRegression``.
        dual: Forwarded to ``LogisticRegression``.

    Returns:
        dict: ``{'te_pred_prob': ...}`` with the ``predict_proba`` output for the held-out rows.
    """
    # Work on a copy so label-encoding never leaks into the caller's DataFrame.
    data = df.copy()
    nonnumeric_columns = [col for col in data if data[col].dtype == 'object']
    for col in nonnumeric_columns:
        data[col] = LabelEncoder().fit_transform(data[col])
    data = data.fillna(0)

    features = [col for col in data if col != label]
    # Fixed random_state keeps the split reproducible between calls.
    tr_x, te_x, tr_y, _ = train_test_split(data[features], data[label], test_size=test_size, random_state=1)

    model = LogisticRegression(penalty=penalty, dual=dual)
    model.fit(tr_x, tr_y)
    return {'te_pred_prob': model.predict_proba(te_x)}
|
||||
|
||||
|
||||
@registry.register("classification_model", RandomForestClassification)
def random_forest_classification(df, label, test_size=0.2, n_estimators=100, criterion='gini'):
    """Fit a RandomForestClassifier on a train split and predict the held-out rows.

    Object-dtype columns (the label included, if it is one) are label-encoded
    and remaining missing values are replaced with 0 before splitting.

    Args:
        df: DataFrame holding the feature columns and the label column.
        label: Name of the target column.
        test_size: Forwarded to ``train_test_split``.
        n_estimators: Forwarded to ``RandomForestClassifier``.
        criterion: Forwarded to ``RandomForestClassifier``.

    Returns:
        dict: ``{'te_pred_prob': ...}`` with the ``predict_proba`` output for the held-out rows.
    """
    # Work on a copy so label-encoding never leaks into the caller's DataFrame.
    data = df.copy()
    nonnumeric_columns = [col for col in data if data[col].dtype == 'object']
    for col in nonnumeric_columns:
        data[col] = LabelEncoder().fit_transform(data[col])
    data = data.fillna(0)

    features = [col for col in data if col != label]
    # Fixed random_state keeps the split reproducible between calls.
    tr_x, te_x, tr_y, _ = train_test_split(data[features], data[label], test_size=test_size, random_state=1)

    model = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion)
    model.fit(tr_x, tr_y)
    return {'te_pred_prob': model.predict_proba(te_x)}
|
||||
|
||||
|
||||
@registry.register("classification_model", GradientBoostingClassification)
def gradient_boosting_classification(df, label, test_size=0.2, n_estimators=100, learning_rate=0.1):
    """Fit a GradientBoostingClassifier on a train split and predict the held-out rows.

    Object-dtype columns (the label included, if it is one) are label-encoded
    and remaining missing values are replaced with 0 before splitting.

    Args:
        df: DataFrame holding the feature columns and the label column.
        label: Name of the target column.
        test_size: Forwarded to ``train_test_split``.
        n_estimators: Forwarded to ``GradientBoostingClassifier``.
        learning_rate: Forwarded to ``GradientBoostingClassifier``.

    Returns:
        dict: ``{'te_pred_prob': ...}`` with the ``predict_proba`` output for the held-out rows.
    """
    # Work on a copy so label-encoding never leaks into the caller's DataFrame.
    data = df.copy()
    nonnumeric_columns = [col for col in data if data[col].dtype == 'object']
    for col in nonnumeric_columns:
        data[col] = LabelEncoder().fit_transform(data[col])
    data = data.fillna(0)

    features = [col for col in data if col != label]
    # Fixed random_state keeps the split reproducible between calls.
    tr_x, te_x, tr_y, _ = train_test_split(data[features], data[label], test_size=test_size, random_state=1)

    model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate)
    model.fit(tr_x, tr_y)
    return {'te_pred_prob': model.predict_proba(te_x)}
|
||||
|
||||
|
||||
|
||||
#########
|
||||
## Regression ##
|
||||
#########
|
||||
|
||||
|
||||
@registry.register("regression_model", LinearRegressionRegression)
def linear_regression(df, label, test_size=0.2):
    """Fit a LinearRegression on a train split and predict the held-out rows.

    Object-dtype columns are label-encoded and remaining missing values are
    replaced with 0 before splitting.

    Args:
        df: DataFrame holding the feature columns and the label column.
        label: Name of the target column.
        test_size: Forwarded to ``train_test_split``.

    Returns:
        dict: ``{'te_pred_prob': ...}``; despite the key name, the value is the
        ``predict`` output (point predictions) for the held-out rows.
    """
    # Work on a copy so label-encoding never leaks into the caller's DataFrame.
    data = df.copy()
    nonnumeric_columns = [col for col in data if data[col].dtype == 'object']
    for col in nonnumeric_columns:
        data[col] = LabelEncoder().fit_transform(data[col])
    data = data.fillna(0)

    features = [col for col in data if col != label]
    # Fixed random_state keeps the split reproducible between calls.
    tr_x, te_x, tr_y, _ = train_test_split(data[features], data[label], test_size=test_size, random_state=1)

    model = LinearRegression()
    model.fit(tr_x, tr_y)
    return {'te_pred_prob': model.predict(te_x)}
|
||||
|
||||
|
||||
@registry.register("regression_model", RandomForestRegression)
def random_forest_regression(df, label, test_size=0.2, n_estimators=100, criterion='squared_error'):
    """Fit a RandomForestRegressor on a train split and predict the held-out rows.

    Object-dtype columns are label-encoded and remaining missing values are
    replaced with 0 before splitting.

    Args:
        df: DataFrame holding the feature columns and the label column.
        label: Name of the target column.
        test_size: Forwarded to ``train_test_split``.
        n_estimators: Forwarded to ``RandomForestRegressor``.
        criterion: Forwarded to ``RandomForestRegressor``.

    Returns:
        dict: ``{'te_pred_prob': ...}``; despite the key name, the value is the
        ``predict`` output (point predictions) for the held-out rows.
    """
    # Work on a copy so label-encoding never leaks into the caller's DataFrame.
    data = df.copy()
    nonnumeric_columns = [col for col in data if data[col].dtype == 'object']
    for col in nonnumeric_columns:
        data[col] = LabelEncoder().fit_transform(data[col])
    data = data.fillna(0)

    features = [col for col in data if col != label]
    # Fixed random_state keeps the split reproducible between calls.
    tr_x, te_x, tr_y, _ = train_test_split(data[features], data[label], test_size=test_size, random_state=1)

    model = RandomForestRegressor(n_estimators=n_estimators, criterion=criterion)
    model.fit(tr_x, tr_y)
    return {'te_pred_prob': model.predict(te_x)}
|
||||
|
||||
|
||||
@registry.register("regression_model", GradientBoostingRegression)
def gradient_boosting_regression(df, label, test_size=0.2, n_estimators=100, learning_rate=0.1):
    """Fit a GradientBoostingRegressor on a train split and predict the held-out rows.

    Object-dtype columns are label-encoded and remaining missing values are
    replaced with 0 before splitting.

    Args:
        df: DataFrame holding the feature columns and the label column.
        label: Name of the target column.
        test_size: Forwarded to ``train_test_split``.
        n_estimators: Forwarded to ``GradientBoostingRegressor``.
        learning_rate: Forwarded to ``GradientBoostingRegressor``.

    Returns:
        dict: ``{'te_pred_prob': ...}``; despite the key name, the value is the
        ``predict`` output (point predictions) for the held-out rows.
    """
    # Work on a copy so label-encoding never leaks into the caller's DataFrame.
    data = df.copy()
    nonnumeric_columns = [col for col in data if data[col].dtype == 'object']
    for col in nonnumeric_columns:
        data[col] = LabelEncoder().fit_transform(data[col])
    data = data.fillna(0)

    features = [col for col in data if col != label]
    # Fixed random_state keeps the split reproducible between calls.
    tr_x, te_x, tr_y, _ = train_test_split(data[features], data[label], test_size=test_size, random_state=1)

    model = GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=learning_rate)
    model.fit(tr_x, tr_y)
    return {'te_pred_prob': model.predict(te_x)}
|
||||
|
||||
|
||||
if __name__ == '__main__':
    def run():
        """Run each registered model once on a toy dataset and print its predictions."""
        from sklearn.datasets import load_iris
        from sklearn.datasets import make_regression
        import pandas as pd

        banner = '####' * 5

        # --- classification demo on Iris ---
        loader = load_iris(as_frame=True)
        df = loader['data']
        df['target'] = loader['target']

        # Force a string feature, an int feature and a string label so the
        # object-column encoding path is exercised.
        first_col, second_col = df.columns[0], df.columns[1]
        df[first_col] = df[first_col].astype(str)
        df[second_col] = df[second_col].astype(int)
        df['target'] = df['target'].astype(str)

        print(df)
        classifiers = (
            (logistic_regression_classification, {'penalty': 'l2', 'dual': False}),
            (random_forest_classification, {'n_estimators': 100, 'criterion': 'gini'}),
            (gradient_boosting_classification, {'n_estimators': 100, 'learning_rate': 0.1}),
        )
        for trainer, extra in classifiers:
            print(banner)
            res = trainer(df, 'target', test_size=0.25, **extra)
            print(res['te_pred_prob'])

        # --- regression demo on synthetic data ---
        loader = make_regression()
        df = pd.DataFrame(loader[0])
        df['target'] = loader[1]

        first_col, second_col = df.columns[0], df.columns[1]
        df[first_col] = df[first_col].astype(str)
        df[second_col] = df[second_col].astype(int)
        # df['target'] = df['target'].astype(str)

        print(df)
        regressors = (
            (linear_regression, {}),
            (random_forest_regression, {'n_estimators': 100, 'criterion': 'squared_error'}),
            (gradient_boosting_regression, {'n_estimators': 100, 'learning_rate': 0.1}),
        )
        for trainer, extra in regressors:
            print(banner)
            res = trainer(df, 'target', test_size=0.25, **extra)
            print(res['te_pred_prob'])

    run()
|
||||
62
metagpt/tools/functions/schemas/data_preprocess.py
Normal file
62
metagpt/tools/functions/schemas/data_preprocess.py
Normal file
|
|
@ -0,0 +1,62 @@
|
|||
|
||||
import pandas as pd
|
||||
|
||||
from metagpt.tools.functions.schemas.base import tool_field, ToolSchema
|
||||
|
||||
|
||||
class FillMissingValue(ToolSchema):
    """Completing missing values with simple strategies"""

    # Frame whose missing values are filled.
    df: pd.DataFrame = tool_field(description="input dataframe")
    # Names of the columns to impute.
    features: list = tool_field(description="columns to be processed")
    # Imputation strategy name; defaults to 'mean'.
    strategy: str = tool_field(
        description="the imputation strategy",
        default='mean',
    )
    # NOTE(review): annotated as int, yet the default is None and the demo in
    # libs/data_preprocess.py passes a string ('hehe') — confirm the intended type.
    fill_value: int = tool_field(
        description="fill_value is used to replace all occurrences of missing_values",
        default=None,
    )
|
||||
|
||||
|
||||
# class LabelEncode(ToolSchema):
|
||||
# """Completing missing values with simple strategies"""
|
||||
# df: pd.DataFrame = tool_field(description="input dataframe")
|
||||
# features: list = tool_field(description="columns to be processed")
|
||||
|
||||
|
||||
class SplitBins(ToolSchema):
    """Bin continuous data into intervals and return the bin identifier encoded as an integer value"""

    # Frame whose columns are discretised.
    df: pd.DataFrame = tool_field(description="input dataframe")
    # Names of the columns to bin.
    features: list = tool_field(description="columns to be processed")
    # Binning strategy name; defaults to 'quantile'.
    strategy: str = tool_field(
        description="Strategy used to define the widths of the bins",
        default='quantile',
    )
|
||||
|
||||
|
||||
class MinMaxScale(ToolSchema):
    """Transform features by scaling each feature to a range, which is (0, 1)"""

    # Frame whose columns are rescaled.
    df: pd.DataFrame = tool_field(description="input dataframe")
    # Names of the columns to scale.
    features: list = tool_field(description="columns to be processed")
|
||||
|
||||
|
||||
class StandardScale(ToolSchema):
    """Standardize features by removing the mean and scaling to unit variance"""

    # Frame whose columns are standardised.
    df: pd.DataFrame = tool_field(
        description="input dataframe",
    )
    # Names of the columns to scale.
    features: list = tool_field(
        description="columns to be processed",
    )
|
||||
|
||||
|
||||
class LogTransform(ToolSchema):
    """Performs a logarithmic transformation on the specified columns"""

    # Frame whose columns are log-transformed.
    df: pd.DataFrame = tool_field(
        description="input dataframe",
    )
    # Names of the columns to transform.
    features: list = tool_field(
        description="columns to be processed",
    )
|
||||
|
||||
|
||||
class MaxAbsScale(ToolSchema):
    """Scale each feature by its maximum absolute value"""

    # Frame whose columns are rescaled.
    df: pd.DataFrame = tool_field(
        description="input dataframe",
    )
    # Names of the columns to scale.
    features: list = tool_field(
        description="columns to be processed",
    )
|
||||
|
||||
|
||||
class RobustScale(ToolSchema):
    """Scale features using statistics that are robust to outliers, the quantile_range is (25.0, 75.0)"""

    # Frame whose columns are rescaled.
    df: pd.DataFrame = tool_field(
        description="input dataframe",
    )
    # Names of the columns to scale.
    features: list = tool_field(
        description="columns to be processed",
    )
|
||||
|
||||
|
||||
class OrdinalEncode(ToolSchema):
    """Encode categorical features as an integer array"""

    # Frame whose columns are encoded.
    df: pd.DataFrame = tool_field(
        description="input dataframe",
    )
    # Names of the columns to encode.
    features: list = tool_field(
        description="columns to be processed",
    )
|
||||
|
||||
55
metagpt/tools/functions/schemas/ml_model.py
Normal file
55
metagpt/tools/functions/schemas/ml_model.py
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
import pandas as pd
|
||||
|
||||
from metagpt.tools.functions.schemas.base import tool_field, ToolSchema
|
||||
|
||||
|
||||
class LogisticRegressionClassification(ToolSchema):
    """Logistic Regression (aka logit, MaxEnt) classifier"""

    # Frame holding the features and the label column.
    df: pd.DataFrame = tool_field(description="input dataframe")
    # Name of the target column inside df.
    label: str = tool_field(description="target name")
    test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)
    penalty: str = tool_field(description="Specify the norm of the penalty", default="l2")
    # Boolean flag: the default is False, matching the
    # logistic_regression_classification(dual=False) signature it describes.
    dual: bool = tool_field(description="Dual (constrained) or primal (regularized) formulation", default=False)
|
||||
|
||||
|
||||
class RandomForestClassification(ToolSchema):
    """random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting"""

    # Frame holding the features and the label column.
    df: pd.DataFrame = tool_field(description="input dataframe")
    # Name of the target column inside df.
    label: str = tool_field(description="target name")
    # Held-out fraction; defaults to 0.2.
    test_size: float = tool_field(
        description="The proportion of the test set to all the data",
        default=0.2,
    )
    # Tree count; defaults to 100.
    n_estimators: int = tool_field(
        description="The number of trees in the forest",
        default=100,
    )
    # Split-quality criterion name; defaults to "gini".
    criterion: str = tool_field(
        description="The function to measure the quality of a split",
        default="gini",
    )
|
||||
|
||||
|
||||
class GradientBoostingClassification(ToolSchema):
    """Gradient Boosting for classification.This algorithm builds an additive model in a forward stage-wise fashion"""

    # Frame holding the features and the label column.
    df: pd.DataFrame = tool_field(description="input dataframe")
    # Name of the target column inside df.
    label: str = tool_field(description="target name")
    # Held-out fraction; defaults to 0.2.
    test_size: float = tool_field(
        description="The proportion of the test set to all the data",
        default=0.2,
    )
    # Number of boosting stages; defaults to 100.
    n_estimators: int = tool_field(
        description="The number of boosting stages to perform",
        default=100,
    )
    # Per-tree shrinkage factor; defaults to 0.1.
    learning_rate: float = tool_field(
        description="Learning rate shrinks the contribution of each tree by learning_rate",
        default=0.1,
    )
|
||||
|
||||
|
||||
class LinearRegressionRegression(ToolSchema):
    """Ordinary least squares Linear Regression."""

    # Frame holding the features and the label column.
    df: pd.DataFrame = tool_field(description="input dataframe")
    # Name of the target column inside df.
    label: str = tool_field(description="target name")
    # Held-out fraction; defaults to 0.2.
    test_size: float = tool_field(
        description="The proportion of the test set to all the data",
        default=0.2,
    )
|
||||
|
||||
|
||||
class RandomForestRegression(ToolSchema):
    """random forest is a meta estimator that fits a number of decision tree on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting"""

    # Frame holding the features and the label column.
    df: pd.DataFrame = tool_field(description="input dataframe")
    # Name of the target column inside df.
    label: str = tool_field(description="target name")
    # Held-out fraction; defaults to 0.2.
    test_size: float = tool_field(
        description="The proportion of the test set to all the data",
        default=0.2,
    )
    # Tree count; defaults to 100.
    n_estimators: int = tool_field(
        description="The number of trees in the forest",
        default=100,
    )
    # Split-quality criterion name; defaults to "squared_error".
    criterion: str = tool_field(
        description="The function to measure the quality of a split",
        default="squared_error",
    )
|
||||
|
||||
|
||||
class GradientBoostingRegression(ToolSchema):
    """Gradient Boosting for regression.This estimator builds an additive model in a forward stage-wise fashion"""

    # Frame holding the features and the label column.
    df: pd.DataFrame = tool_field(description="input dataframe")
    # Name of the target column inside df.
    label: str = tool_field(description="target name")
    # Held-out fraction; defaults to 0.2.
    test_size: float = tool_field(
        description="The proportion of the test set to all the data",
        default=0.2,
    )
    # Number of boosting stages; defaults to 100.
    n_estimators: int = tool_field(
        description="The number of boosting stages to perform",
        default=100,
    )
    # Per-tree shrinkage factor; defaults to 0.1.
    learning_rate: float = tool_field(
        description="Learning rate shrinks the contribution of each tree by learning_rate",
        default=0.1,
    )
|
||||
Loading…
Add table
Add a link
Reference in a new issue