Merge branch 'dev' into 'dev_make_tools'

# Conflicts:
#   metagpt/roles/ml_engineer.py
This commit is contained in:
刘棒棒 2023-12-26 06:18:27 +00:00
commit c76c1765ef
6 changed files with 276 additions and 55 deletions

View file

@@ -6,7 +6,7 @@
# @Desc :
UPDATE_DATA_COLUMNS = """
# Background
Keep dataset column information updated to reflect changes in training or testing datasets, aiding in informed decision-making during data analysis.
Keep dataset column information updated before model training.
## Done Tasks
```python
{history_code}
@@ -18,15 +18,13 @@ Update and print the dataset's column information only if the train or test data
from metagpt.tools.functions.libs.data_preprocess import get_column_info
column_info = get_column_info(df)
print("df_column_info")
print("column_info")
print(column_info)
```end
# Constraints:
- Use the DataFrame variable from 'Done Tasks' in place of df.
- Import `get_column_info` only if it's not already imported.
- Skip update if no changes in training/testing data, except for initial data load.
- No need to update info if only model evaluation is performed.
"""
GEN_DATA_DESC_PROMPT = """
@@ -185,7 +183,7 @@ obj_cols = train.select_dtypes(include='object').columns.tolist()
for col in obj_cols:
encoder = LabelEncoder()
train[col] = encoder.fit_transform(train[col])
encoder.fit(train[col].unique().tolist() + ['unknown'])
train[col] = encoder.transform(train[col])
test[col] = test[col].apply(lambda x: x if x in encoder.classes_ else 'unknown')
test[col] = encoder.transform(test[col])
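A self-contained sketch of the unseen-category pattern above, with hypothetical toy frames so it runs as-is; fitting on the train categories plus an 'unknown' sentinel lets unseen test values be encoded safely:

```python
import pandas as pd
from sklearn.preprocessing import LabelEncoder

train = pd.DataFrame({"color": ["red", "blue", "red"]})  # hypothetical data
test = pd.DataFrame({"color": ["green", "red"]})         # 'green' never seen in train

obj_cols = train.select_dtypes(include="object").columns.tolist()
for col in obj_cols:
    encoder = LabelEncoder()
    # Fit on the train categories plus a sentinel class for unseen test values.
    encoder.fit(train[col].unique().tolist() + ["unknown"])
    train[col] = encoder.transform(train[col])
    test[col] = test[col].apply(lambda x: x if x in encoder.classes_ else "unknown")
    test[col] = encoder.transform(test[col])
```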
@@ -241,6 +239,8 @@ from metagpt.tools.functions.libs.data_preprocess import FillMissingValue
train_processed = train.copy()
test_processed = test.copy()
num_cols = train_processed.select_dtypes(include='number').columns.tolist()
if 'label' in num_cols:
num_cols.remove('label')
fill_missing_value = FillMissingValue(features=num_cols, strategy='mean')
fill_missing_value.fit(train_processed)
train_processed = fill_missing_value.transform(train_processed)
@@ -266,23 +266,29 @@ The current task is about data preprocessing; please note the following:
- Monitor data types per column, applying appropriate methods.
- Ensure operations are on existing dataset columns.
- Avoid writing processed data to files.
- Avoid any changes to the label column, such as standardization.
- Prefer alternatives to one-hot encoding for categorical data.
- Only encode necessary categorical columns to allow for potential feature-specific engineering tasks later.
- Only encode or scale necessary columns to allow for potential feature-specific engineering tasks (such as time_extract, binning, or extraction) later.
- Every preprocessing step applied to the train data must also be applied to the test data separately at the same time (see the sketch after this prompt).
"""
FEATURE_ENGINEERING_PROMPT = """
The current task is about feature engineering. When performing it, please adhere to the following principles:
- Ensure operations are on existing dataset columns and consider the data type (numerical, categorical, etc.) and application scenario (classification, regression tasks, etc.).
- Create impactful features based on real-world knowledge and column info.
- Generate as diverse features as possible to improve the model's performance.
- Generate features as diverse as possible to improve the model's performance step by step.
- If potentially impactful features are not included in 'Code Steps', add new steps to generate them.
- Avoid creating redundant or excessively numerous features in one step.
- Exclude ID columns from feature generation and remove them.
- Every feature engineering step applied to the train data must also be applied to the test data separately at the same time (see the sketch after this prompt).
- Avoid using the label column to create features, except for cat encoding.
- Use the data from the previous task's result if it exists; do not mock or reload data yourself.
"""
MODEL_TRAIN_PROMPT = """
The current task is about training a model; please ensure high performance:
- Keep in mind that your user prioritizes results and is highly focused on model performance. So, when needed, feel free to use models of any complexity to improve effectiveness, such as LightGBM, XGBoost, CatBoost, etc.
- Before training, first check not is_numeric_dtype columns and use label encoding to convert them to numeric columns.
- If non-numeric columns exist, perform label encoding along with the other steps (see the sketch after this prompt).
- Use the data from the previous task's result directly; do not mock or reload data yourself.
- Set suitable hyperparameters for the model to push the metrics as high as possible.
"""
MODEL_EVALUATE_PROMPT = """

View file

@@ -84,7 +84,7 @@ class MLEngineer(Role):
self.plan.finish_current_task()
self.working_memory.clear()
if self.use_tools or self.use_udfs:
if (self.use_tools and task.task_type not in ['model_train', 'model_evaluate']) or self.use_udfs:
success, new_code = await self._update_data_columns()
if success:
task.code = task.code + "\n\n" + new_code
@@ -123,6 +123,7 @@ class MLEngineer(Role):
if is_update:
result, success = await self.execute_code.run(code)
if success:
print(result)
self.data_desc["column_info"] = result
return success, code
@@ -339,7 +340,7 @@ if __name__ == "__main__":
# requirement = f"This is a medical dataset with over fifty anonymized health characteristics linked to three age-related conditions. Your goal is to predict whether a subject has or has not been diagnosed with one of these conditions.The target column is Class. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report f1 score on the eval data. Train data path: {data_path}/split_train.csv, eval data path: {data_path}/split_eval.csv."
# data_path = f"{DATA_PATH}/santander-customer-transaction-prediction"
# requirement = f"This is a customers financial dataset. Your goal is to predict which customers will make a specific transaction in the future. The target column is target. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report F1 Score on the eval data. Train data path: '{data_path}/split_train.csv', eval data path: '{data_path}/split_eval.csv' ."
# requirement = f"This is a customers financial dataset. Your goal is to predict which customers will make a specific transaction in the future. The target column is target. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report AUC Score on the eval data. Train data path: '{data_path}/split_train.csv', eval data path: '{data_path}/split_eval.csv' ."
# data_path = f"{DATA_PATH}/house-prices-advanced-regression-techniques"
# requirement = f"This is a house price dataset, your goal is to predict the sale price of a property based on its features. The target column is SalePrice. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report RMSE between the logarithm of the predicted value and the logarithm of the observed sales price on the eval data. Train data path: '{data_path}/split_train.csv', eval data path: '{data_path}/split_eval.csv'."

View file

@@ -1,3 +1,5 @@
import json
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
@@ -20,10 +22,14 @@ class FillMissingValue(MLProcess):
self.si = None
def fit(self, df: pd.DataFrame):
if len(self.features) == 0:
return
self.si = SimpleImputer(strategy=self.strategy, fill_value=self.fill_value)
self.si.fit(df[self.features])
def transform(self, df: pd.DataFrame):
if len(self.features) == 0:
return df
df[self.features] = self.si.transform(df[self.features])
return df
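The new guards make the empty-feature case a no-op; a small sketch of why that matters, assuming a hypothetical frame with nothing numeric to impute:

```python
import pandas as pd

from metagpt.tools.functions.libs.data_preprocess import FillMissingValue

df = pd.DataFrame({"city": ["NY", None]})  # hypothetical: no numeric columns at all

fill = FillMissingValue(features=[], strategy="mean")
fill.fit(df)             # returns early instead of fitting SimpleImputer on zero columns
df = fill.transform(df)  # hands the frame back unchanged
```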
@@ -122,11 +128,15 @@ class LabelEncode(MLProcess):
self.le_encoders = []
def fit(self, df: pd.DataFrame):
if len(self.features) == 0:
return
for col in self.features:
le = LabelEncoder().fit(df[col].astype(str).unique().tolist() + ['unknown'])
self.le_encoders.append(le)
def transform(self, df: pd.DataFrame):
if len(self.features) == 0:
return df
for i in range(len(self.features)):
data_list = df[self.features[i]].astype(str).tolist()
for unique_item in np.unique(df[self.features[i]].astype(str)):
@@ -137,17 +147,23 @@
def get_column_info(df: pd.DataFrame) -> dict:
data = []
for i in df.columns:
nan_freq = float("%.2g" % (df[i].isna().mean() * 100))
n_unique = df[i].nunique()
data_type = str(df[i].dtype).replace("dtype('", "").replace("')", "")
if data_type == "O":
data_type = "object"
data.append([i, data_type, nan_freq, n_unique])
column_info = {
"Category": [],
"Numeric": [],
"Datetime": [],
"Others": [],
}
for col in df.columns:
data_type = str(df[col].dtype).replace("dtype('", "").replace("')", "")
if data_type.startswith("object"):
column_info["Category"].append(col)
elif data_type.startswith("int") or data_type.startswith("float"):
column_info["Numeric"].append(col)
elif data_type.startswith("datetime"):
column_info["Datetime"].append(col)
else:
column_info["Others"].append(col)
samples = pd.DataFrame(
data,
columns=["Column_name", "Data_type", "NaN_Frequency(%)", "N_unique"],
)
return samples.to_dict(orient='list')
if len(json.dumps(column_info)) > 2000:
column_info['Numeric'] = column_info['Numeric'][0:5] + ['Too many cols, omission here...']
return column_info
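With this rewrite the function groups columns by kind and truncates oversized output; a quick sketch of the return value for a small hypothetical frame, assuming get_column_info above is in scope:

```python
import pandas as pd

df = pd.DataFrame({
    "price": [1.5, 2.0],
    "city": ["NY", "LA"],
    "ts": pd.to_datetime(["2023-01-01", "2023-01-02"]),
})
print(get_column_info(df))
# -> {'Category': ['city'], 'Numeric': ['price'], 'Datetime': ['ts'], 'Others': []}
```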

View file

@@ -6,12 +6,12 @@
# @Desc : Feature Engineering Tools
import itertools
import lightgbm as lgb
import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta
from joblib import Parallel, delayed
from pandas.api.types import is_numeric_dtype
from pandas.core.dtypes.common import is_object_dtype
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import KFold
from sklearn.preprocessing import PolynomialFeatures, KBinsDiscretizer
@@ -19,15 +19,27 @@ from metagpt.tools.functions.libs.base import MLProcess
class PolynomialExpansion(MLProcess):
def __init__(self, cols: list, degree: int = 2):
def __init__(self, cols: list, degree: int = 2, label_col: str = None):
self.cols = cols
self.degree = degree
self.label_col = label_col
if self.label_col in self.cols:
self.cols.remove(self.label_col)
self.poly = PolynomialFeatures(degree=degree, include_bias=False)
def fit(self, df: pd.DataFrame):
if len(self.cols) == 0:
return
if len(self.cols) > 10:
corr = df[self.cols + [self.label_col]].corr()
corr = corr[self.label_col].abs().sort_values(ascending=False)
self.cols = corr.index.tolist()[1:11]
self.poly.fit(df[self.cols].fillna(0))
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
if len(self.cols) == 0:
return df
ts_data = self.poly.transform(df[self.cols].fillna(0))
column_name = self.poly.get_feature_names_out(self.cols)
ts_data = pd.DataFrame(ts_data, index=df.index, columns=column_name)
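A usage sketch for the updated signature, on hypothetical toy data; it assumes the rest of transform (truncated in this hunk) concatenates the expanded columns back onto the frame, as the schema below describes:

```python
import pandas as pd

train = pd.DataFrame({"x1": [1, 2, 3, 4], "x2": [2, 1, 4, 3], "label": [0, 1, 0, 1]})

# label_col is dropped from cols in __init__; with more than 10 cols, fit would
# keep only the 10 features most correlated with the label.
poly = PolynomialExpansion(cols=["x1", "x2", "label"], degree=2, label_col="label")
poly.fit(train)
train = poly.transform(train)  # appends x1^2, x1 x2, x2^2 (duplicate base columns dropped)
```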
@@ -158,27 +170,35 @@ class SplitBins(MLProcess):
df[self.cols] = self.encoder.transform(df[self.cols].fillna(0))
return df
# @registry.register("feature_engineering", ExtractTimeComps)
# def extract_time_comps(df, time_col, time_comps):
# time_s = pd.to_datetime(df[time_col], errors="coerce")
# time_comps_df = pd.DataFrame()
#
# if "year" in time_comps:
# time_comps_df["year"] = time_s.dt.year
# if "month" in time_comps:
# time_comps_df["month"] = time_s.dt.month
# if "day" in time_comps:
# time_comps_df["day"] = time_s.dt.day
# if "hour" in time_comps:
# time_comps_df["hour"] = time_s.dt.hour
# if "dayofweek" in time_comps:
# time_comps_df["dayofweek"] = time_s.dt.dayofweek + 1
# if "is_weekend" in time_comps:
# time_comps_df["is_weekend"] = time_s.dt.dayofweek.isin([5, 6]).astype(int)
# df = pd.concat([df, time_comps_df], axis=1)
# return df
#
#
class ExtractTimeComps(MLProcess):
def __init__(self, time_col: str, time_comps: list):
self.time_col = time_col
self.time_comps = time_comps
def fit(self, df: pd.DataFrame):
pass
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
time_s = pd.to_datetime(df[self.time_col], errors="coerce")
time_comps_df = pd.DataFrame()
if "year" in self.time_comps:
time_comps_df["year"] = time_s.dt.year
if "month" in self.time_comps:
time_comps_df["month"] = time_s.dt.month
if "day" in self.time_comps:
time_comps_df["day"] = time_s.dt.day
if "hour" in self.time_comps:
time_comps_df["hour"] = time_s.dt.hour
if "dayofweek" in self.time_comps:
time_comps_df["dayofweek"] = time_s.dt.dayofweek + 1
if "is_weekend" in self.time_comps:
time_comps_df["is_weekend"] = time_s.dt.dayofweek.isin([5, 6]).astype(int)
df = pd.concat([df, time_comps_df], axis=1)
return df
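A usage sketch of the class form, with a hypothetical timestamp column:

```python
import pandas as pd

df = pd.DataFrame({"ts": ["2023-12-23 10:00", "2023-12-26 08:30"]})  # hypothetical

etc = ExtractTimeComps(time_col="ts", time_comps=["dayofweek", "is_weekend"])
etc.fit(df)             # no-op; all the work happens in transform
df = etc.transform(df)  # adds dayofweek (Mon=1 .. Sun=7) and is_weekend (0/1)
```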
# @registry.register("feature_engineering", FeShiftByTime)
# def fe_shift_by_time(df, time_col, group_col, shift_col, periods, freq):
# df[time_col] = pd.to_datetime(df[time_col])
@@ -290,3 +310,66 @@ class GeneralSelection(MLProcess):
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
df = df[self.feats + [self.label_col]]
return df
class TreeBasedSelection(MLProcess):
def __init__(self, label_col: str, task_type: str):
self.label_col = label_col
self.task_type = task_type
self.feats = None
def fit(self, df: pd.DataFrame):
params = {
'boosting_type': 'gbdt',
'objective': 'binary',
'learning_rate': 0.1,
'num_leaves': 31,
}
if self.task_type == "cls":
params["objective"] = "binary"
params["metric"] = "auc"
elif self.task_type == "mcls":
params["objective"] = "multiclass"
params["num_class"] = df[self.label_col].nunique()
params["metric"] = "auc_mu"
elif self.task_type == "reg":
params["objective"] = "regression"
params["metric"] = "rmse"
num_cols = df.select_dtypes(include=np.number).columns.tolist()
cols = [f for f in num_cols if f not in [self.label_col]]
dtrain = lgb.Dataset(df[cols], df[self.label_col])
model = lgb.train(params, dtrain, num_boost_round=100)
df_imp = pd.DataFrame({'feature_name': dtrain.feature_name,
'importance': model.feature_importance("gain")})
df_imp.sort_values("importance", ascending=False, inplace=True)
df_imp = df_imp[df_imp["importance"] > 0]
self.feats = df_imp['feature_name'].tolist()
self.feats.append(self.label_col)
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
df = df[self.feats]
return df
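A small sketch of the new selector on synthetic data, one informative column and one pure-noise column; features with zero gain are dropped and the label is retained:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)  # synthetic data for illustration
df = pd.DataFrame({"signal": rng.normal(size=500), "noise": rng.normal(size=500)})
df["label"] = (df["signal"] > 0).astype(int)

sel = TreeBasedSelection(label_col="label", task_type="cls")
sel.fit(df)             # trains LightGBM and ranks numeric features by gain
df = sel.transform(df)  # keeps positive-importance features plus the label
```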
class VarianceBasedSelection(MLProcess):
def __init__(self, label_col: str, threshold: float = 0):
self.label_col = label_col
self.threshold = threshold
self.feats = None
self.selector = VarianceThreshold(threshold=self.threshold)
def fit(self, df: pd.DataFrame):
num_cols = df.select_dtypes(include=np.number).columns.tolist()
cols = [f for f in num_cols if f not in [self.label_col]]
self.selector.fit(df[cols])
self.feats = df[cols].columns[self.selector.get_support(indices=True)].tolist()
self.feats.append(self.label_col)
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
df = df[self.feats]
return df
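And its variance-based counterpart, where the default threshold of 0 drops constant columns:

```python
import pandas as pd

df = pd.DataFrame({"constant": [1, 1, 1, 1], "varies": [1, 2, 3, 4], "label": [0, 1, 0, 1]})

sel = VarianceBasedSelection(label_col="label")
sel.fit(df)             # 'constant' has zero variance and is filtered out
df = sel.transform(df)  # -> columns ['varies', 'label']
```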

View file

@@ -11,7 +11,7 @@ FillMissingValue:
description: "columns to be processed"
strategy:
type: str
description: "the imputation strategy"
description: "the imputation strategy, notice mean/median can only be used for numeric features"
default: mean
enum:
- mean

View file

@@ -1,6 +1,6 @@
PolynomialExpansion:
type: class
description: "Add polynomial and interaction features from selected numeric columns, excluding the bias column."
description: "Add polynomial and interaction features from selected numeric columns to input DataFrame."
methods:
__init__:
description: "Initialize self."
@@ -9,12 +9,16 @@ PolynomialExpansion:
cols:
type: list
description: "Columns for polynomial expansion."
label_col:
type: str
description: "Label column name."
degree:
type: int
description: "The degree of the polynomial features."
default: 2
required:
- cols
- label_col
fit:
description: "Fit the PolynomialExpansion model."
parameters:
@@ -36,14 +40,14 @@ PolynomialExpansion:
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
description: "The transformed DataFrame without duplicated columns."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
description: "The input DataFrame without duplicated columns."
required:
- df
returns:
@@ -224,7 +228,7 @@ CatCross:
properties:
cols:
type: list
description: "Columns to be pairwise crossed."
description: "Columns to be pairwise crossed, at least 2 columns."
max_cat_num:
type: int
description: "Maximum unique categories per crossed feature."
@@ -430,4 +434,115 @@ GeneralSelection:
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
description: "The transformed DataFrame."
TreeBasedSelection:
type: class
description: "Select features based on tree-based model and remove features with low importance."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
label_col:
type: str
description: "Label column name."
task_type:
type: str
description: "Task type, 'cls' for classification, 'mcls' for multi-class classification, 'reg' for regression."
enum:
- cls
- mcls
- reg
required:
- label_col
- task_type
fit:
description: "Fit the TreeBasedSelection model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame contain label_col."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame contain label_col."
VarianceBasedSelection:
type: class
description: "Select features based on variance and remove features with low variance."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
label_col:
type: str
description: "Label column name."
threshold:
type: float
description: "Threshold for variance."
default: 0.0
required:
- label_col
fit:
description: "Fit the VarianceBasedSelection model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame contain label_col."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame contain label_col."