diff --git a/metagpt/prompts/ml_engineer.py b/metagpt/prompts/ml_engineer.py index cca9649b3..6af40bf97 100644 --- a/metagpt/prompts/ml_engineer.py +++ b/metagpt/prompts/ml_engineer.py @@ -6,7 +6,7 @@ # @Desc : UPDATE_DATA_COLUMNS = """ # Background -Keep dataset column information updated to reflect changes in training or testing datasets, aiding in informed decision-making during data analysis. +Keep dataset column information updated before model train. ## Done Tasks ```python {history_code} @@ -18,15 +18,13 @@ Update and print the dataset's column information only if the train or test data from metagpt.tools.functions.libs.data_preprocess import get_column_info column_info = get_column_info(df) -print("df_column_info") +print("column_info") print(column_info) ```end # Constraints: - Use the DataFrame variable from 'Done Tasks' in place of df. - Import `get_column_info` only if it's not already imported. -- Skip update if no changes in training/testing data, except for initial data load. -- No need to update info if only model evaluation is performed. 
""" GEN_DATA_DESC_PROMPT = """ @@ -185,7 +183,7 @@ ojb_cols = train.select_dtypes(include='object').columns.tolist() for col in obj_cols: encoder = LabelEncoder() - train[col] = encoder.fit_transform(train[col]) + train[col] = encoder.fit(train[col].unique().tolist() + ['unknown']).transform(train[col]) test[col] = test[col].apply(lambda x: x if x in encoder.classes_ else 'unknown') test[col] = encoder.transform(test[col]) @@ -241,6 +239,8 @@ from metagpt.tools.functions.libs.data_preprocess import FillMissingValue train_processed = train.copy() test_processed = test.copy() num_cols = train_processed.select_dtypes(include='number').columns.tolist() +if 'label' in num_cols: + num_cols.remove('label') fill_missing_value = FillMissingValue(features=num_cols, strategy='mean') fill_missing_value.fit(train_processed) train_processed = fill_missing_value.transform(train_processed) @@ -266,23 +266,29 @@ The current task is about data preprocessing, please note the following: - Monitor data types per column, applying appropriate methods. - Ensure operations are on existing dataset columns. - Avoid writing processed data to files. +- Avoid any change to label column, such as standardization, etc. - Prefer alternatives to one-hot encoding for categorical data. -- Only encode necessary categorical columns to allow for potential feature-specific engineering tasks later. +- Only encode or scale necessary columns to allow for potential feature-specific engineering tasks (like time_extract, binning, extraction, etc.) later. +- Each step do data preprocessing to train, must do same for test separately at the same time. """ FEATURE_ENGINEERING_PROMPT = """ The current task is about feature engineering. when performing it, please adhere to the following principles: -- Ensure operations are on existing dataset columns and consider the data type (numerical, categorical, etc.) and application scenario (classification, regression tasks, etc.). 
-- Create impactful features based on real-world knowledge and column info. -- Generate as diverse features as possible to improve the model's performance. +- Generate as diverse features as possible to improve the model's performance step-by-step. - If potential impactful features are not included in 'Code Steps', add new steps to generate them. +- Avoid creating redundant or excessively numerous features in one step. +- Exclude ID columns from feature generation and remove them. +- Each step do feature engineering to train, must do same for test separately at the same time. +- Avoid using the label column to create features, except for cat encoding. +- Use the data from previous task result if exist, do not mock or reload data yourself. """ MODEL_TRAIN_PROMPT = """ The current task is about training a model, please ensure high performance: - Keep in mind that your user prioritizes results and is highly focused on model performance. So, when needed, feel free to use models of any complexity to improve effectiveness, such as lightGBM, XGBoost, CatBoost, etc. -- Before training, first check not is_numeric_dtype columns and use label encoding to convert them to numeric columns. +- If non-numeric columns exist, perform label encode together with all steps. - Use the data from previous task result directly, do not mock or reload data yourself. +- Set suitable hyperparameters for the model, make metrics as high as possible. 
""" MODEL_EVALUATE_PROMPT = """ diff --git a/metagpt/roles/ml_engineer.py b/metagpt/roles/ml_engineer.py index b991d9329..6b90ae9c4 100644 --- a/metagpt/roles/ml_engineer.py +++ b/metagpt/roles/ml_engineer.py @@ -84,7 +84,7 @@ class MLEngineer(Role): self.plan.finish_current_task() self.working_memory.clear() - if self.use_tools or self.use_udfs: + if (self.use_tools and task.task_type not in ['model_train', 'model_evaluate']) or self.use_udfs: success, new_code = await self._update_data_columns() if success: task.code = task.code + "\n\n" + new_code @@ -123,6 +123,7 @@ class MLEngineer(Role): if is_update: result, success = await self.execute_code.run(code) if success: + print(result) self.data_desc["column_info"] = result return success, code @@ -339,7 +340,7 @@ if __name__ == "__main__": # requirement = f"This is a medical dataset with over fifty anonymized health characteristics linked to three age-related conditions. Your goal is to predict whether a subject has or has not been diagnosed with one of these conditions.The target column is Class. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report f1 score on the eval data. Train data path: {data_path}/split_train.csv, eval data path: {data_path}/split_eval.csv." # data_path = f"{DATA_PATH}/santander-customer-transaction-prediction" - # requirement = f"This is a customers financial dataset. Your goal is to predict which customers will make a specific transaction in the future. The target column is target. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report F1 Score on the eval data. Train data path: '{data_path}/split_train.csv', eval data path: '{data_path}/split_eval.csv' ." + # requirement = f"This is a customers financial dataset. Your goal is to predict which customers will make a specific transaction in the future. The target column is target. 
Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report AUC Score on the eval data. Train data path: '{data_path}/split_train.csv', eval data path: '{data_path}/split_eval.csv' ." # data_path = f"{DATA_PATH}/house-prices-advanced-regression-techniques" # requirement = f"This is a house price dataset, your goal is to predict the sale price of a property based on its features. The target column is SalePrice. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report RMSE between the logarithm of the predicted value and the logarithm of the observed sales price on the eval data. Train data path: '{data_path}/split_train.csv', eval data path: '{data_path}/split_eval.csv'." diff --git a/metagpt/tools/functions/libs/data_preprocess.py b/metagpt/tools/functions/libs/data_preprocess.py index 8c70462ee..f1665b405 100644 --- a/metagpt/tools/functions/libs/data_preprocess.py +++ b/metagpt/tools/functions/libs/data_preprocess.py @@ -1,3 +1,5 @@ +import json + import numpy as np import pandas as pd from sklearn.impute import SimpleImputer @@ -20,10 +22,14 @@ class FillMissingValue(MLProcess): self.si = None def fit(self, df: pd.DataFrame): + if len(self.features) == 0: + return self.si = SimpleImputer(strategy=self.strategy, fill_value=self.fill_value) self.si.fit(df[self.features]) def transform(self, df: pd.DataFrame): + if len(self.features) == 0: + return df df[self.features] = self.si.transform(df[self.features]) return df @@ -122,11 +128,15 @@ class LabelEncode(MLProcess): self.le_encoders = [] def fit(self, df: pd.DataFrame): + if len(self.features) == 0: + return for col in self.features: le = LabelEncoder().fit(df[col].astype(str).unique().tolist() + ['unknown']) self.le_encoders.append(le) def transform(self, df: pd.DataFrame): + if len(self.features) == 0: + return df for i in range(len(self.features)): data_list = df[self.features[i]].astype(str).tolist() for 
unique_item in np.unique(df[self.features[i]].astype(str)): @@ -137,17 +147,23 @@ class LabelEncode(MLProcess): def get_column_info(df: pd.DataFrame) -> dict: - data = [] - for i in df.columns: - nan_freq = float("%.2g" % (df[i].isna().mean() * 100)) - n_unique = df[i].nunique() - data_type = str(df[i].dtype).replace("dtype('", "").replace("')", "") - if data_type == "O": - data_type = "object" - data.append([i, data_type, nan_freq, n_unique]) + column_info = { + "Category": [], + "Numeric": [], + "Datetime": [], + "Others": [], + } + for col in df.columns: + data_type = str(df[col].dtype).replace("dtype('", "").replace("')", "") + if data_type.startswith("object"): + column_info["Category"].append(col) + elif pd.api.types.is_numeric_dtype(df[col]) and not pd.api.types.is_bool_dtype(df[col]): + column_info["Numeric"].append(col) + elif data_type.startswith("datetime"): + column_info["Datetime"].append(col) + else: + column_info["Others"].append(col) - samples = pd.DataFrame( - data, - columns=["Column_name", "Data_type", "NaN_Frequency(%)", "N_unique"], - ) - return samples.to_dict(orient='list') + if len(json.dumps(column_info, default=str)) > 2000: + column_info['Numeric'] = column_info['Numeric'][0:5] + ['Too many cols, omission here...'] + return column_info diff --git a/metagpt/tools/functions/libs/feature_engineering.py b/metagpt/tools/functions/libs/feature_engineering.py index 1ec2b9675..df36752b9 100644 --- a/metagpt/tools/functions/libs/feature_engineering.py +++ b/metagpt/tools/functions/libs/feature_engineering.py @@ -6,12 +6,12 @@ # @Desc : Feature Engineering Tools import itertools +import lightgbm as lgb import numpy as np import pandas as pd -from dateutil.relativedelta import relativedelta from joblib import Parallel, delayed -from pandas.api.types import is_numeric_dtype from pandas.core.dtypes.common import is_object_dtype +from sklearn.feature_selection import VarianceThreshold from sklearn.model_selection import KFold from sklearn.preprocessing import PolynomialFeatures, 
KBinsDiscretizer @@ -19,15 +19,27 @@ from metagpt.tools.functions.libs.base import MLProcess class PolynomialExpansion(MLProcess): - def __init__(self, cols: list, degree: int = 2): + def __init__(self, cols: list, degree: int = 2, label_col: str = None): self.cols = cols self.degree = degree + self.label_col = label_col + if self.label_col in self.cols: + self.cols = [col for col in self.cols if col != self.label_col] self.poly = PolynomialFeatures(degree=degree, include_bias=False) def fit(self, df: pd.DataFrame): + if len(self.cols) == 0: + return + if len(self.cols) > 10 and self.label_col is not None: + corr = df[self.cols + [self.label_col]].corr() + corr = corr[self.label_col].drop(self.label_col).abs().sort_values(ascending=False) + self.cols = corr.index.tolist()[:10] + self.poly.fit(df[self.cols].fillna(0)) def transform(self, df: pd.DataFrame) -> pd.DataFrame: + if len(self.cols) == 0: + return df ts_data = self.poly.transform(df[self.cols].fillna(0)) column_name = self.poly.get_feature_names_out(self.cols) ts_data = pd.DataFrame(ts_data, index=df.index, columns=column_name) @@ -158,27 +170,35 @@ class SplitBins(MLProcess): df[self.cols] = self.encoder.transform(df[self.cols].fillna(0)) return df -# @registry.register("feature_engineering", ExtractTimeComps) -# def extract_time_comps(df, time_col, time_comps): -# time_s = pd.to_datetime(df[time_col], errors="coerce") -# time_comps_df = pd.DataFrame() -# -# if "year" in time_comps: -# time_comps_df["year"] = time_s.dt.year -# if "month" in time_comps: -# time_comps_df["month"] = time_s.dt.month -# if "day" in time_comps: -# time_comps_df["day"] = time_s.dt.day -# if "hour" in time_comps: -# time_comps_df["hour"] = time_s.dt.hour -# if "dayofweek" in time_comps: -# time_comps_df["dayofweek"] = time_s.dt.dayofweek + 1 -# if "is_weekend" in time_comps: -# time_comps_df["is_weekend"] = time_s.dt.dayofweek.isin([5, 6]).astype(int) -# df = pd.concat([df, time_comps_df], axis=1) -# return df -# -# + +class ExtractTimeComps(MLProcess): + def __init__(self, time_col: str, time_comps: list): + 
self.time_col = time_col + self.time_comps = time_comps + + def fit(self, df: pd.DataFrame): + pass + + def transform(self, df: pd.DataFrame) -> pd.DataFrame: + time_s = pd.to_datetime(df[self.time_col], errors="coerce") + time_comps_df = pd.DataFrame() + + if "year" in self.time_comps: + time_comps_df["year"] = time_s.dt.year + if "month" in self.time_comps: + time_comps_df["month"] = time_s.dt.month + if "day" in self.time_comps: + time_comps_df["day"] = time_s.dt.day + if "hour" in self.time_comps: + time_comps_df["hour"] = time_s.dt.hour + if "dayofweek" in self.time_comps: + time_comps_df["dayofweek"] = time_s.dt.dayofweek + 1 + if "is_weekend" in self.time_comps: + time_comps_df["is_weekend"] = time_s.dt.dayofweek.isin([5, 6]).astype(int) + df = pd.concat([df, time_comps_df], axis=1) + return df + + # @registry.register("feature_engineering", FeShiftByTime) # def fe_shift_by_time(df, time_col, group_col, shift_col, periods, freq): # df[time_col] = pd.to_datetime(df[time_col]) @@ -290,3 +310,66 @@ class GeneralSelection(MLProcess): def transform(self, df: pd.DataFrame) -> pd.DataFrame: df = df[self.feats + [self.label_col]] return df + + +class TreeBasedSelection(MLProcess): + def __init__(self, label_col: str, task_type: str): + self.label_col = label_col + self.task_type = task_type + self.feats = None + + def fit(self, df: pd.DataFrame): + params = { + 'boosting_type': 'gbdt', + 'objective': 'binary', + 'learning_rate': 0.1, + 'num_leaves': 31, + } + + if self.task_type == "cls": + params["objective"] = "binary" + params["metric"] = "auc" + elif self.task_type == "mcls": + params["objective"] = "multiclass" + params["num_class"] = df[self.label_col].nunique() + params["metric"] = "auc_mu" + elif self.task_type == "reg": + params["objective"] = "regression" + params["metric"] = "rmse" + + num_cols = df.select_dtypes(include=np.number).columns.tolist() + cols = [f for f in num_cols if f not in [self.label_col]] + + dtrain = lgb.Dataset(df[cols], 
df[self.label_col]) + model = lgb.train(params, dtrain, num_boost_round=100) + df_imp = pd.DataFrame({'feature_name': dtrain.feature_name, + 'importance': model.feature_importance("gain")}) + + df_imp.sort_values("importance", ascending=False, inplace=True) + df_imp = df_imp[df_imp["importance"] > 0] + self.feats = df_imp['feature_name'].tolist() + self.feats.append(self.label_col) + + def transform(self, df: pd.DataFrame) -> pd.DataFrame: + df = df[self.feats] + return df + + +class VarianceBasedSelection(MLProcess): + def __init__(self, label_col: str, threshold: float = 0): + self.label_col = label_col + self.threshold = threshold + self.feats = None + self.selector = VarianceThreshold(threshold=self.threshold) + + def fit(self, df: pd.DataFrame): + num_cols = df.select_dtypes(include=np.number).columns.tolist() + cols = [f for f in num_cols if f not in [self.label_col]] + + self.selector.fit(df[cols]) + self.feats = df[cols].columns[self.selector.get_support(indices=True)].tolist() + self.feats.append(self.label_col) + + def transform(self, df: pd.DataFrame) -> pd.DataFrame: + df = df[self.feats] + return df diff --git a/metagpt/tools/functions/schemas/data_preprocess.yml b/metagpt/tools/functions/schemas/data_preprocess.yml index 95b0124cc..4de697abd 100644 --- a/metagpt/tools/functions/schemas/data_preprocess.yml +++ b/metagpt/tools/functions/schemas/data_preprocess.yml @@ -11,7 +11,7 @@ FillMissingValue: description: "columns to be processed" strategy: type: str - description: "the imputation strategy" + description: "the imputation strategy, notice mean/median can only be used for numeric features" default: mean enum: - mean diff --git a/metagpt/tools/functions/schemas/feature_engineering.yml b/metagpt/tools/functions/schemas/feature_engineering.yml index 3ba9e863b..62e6ad5b3 100644 --- a/metagpt/tools/functions/schemas/feature_engineering.yml +++ b/metagpt/tools/functions/schemas/feature_engineering.yml @@ -1,6 +1,6 @@ PolynomialExpansion: type: class - 
description: "Add polynomial and interaction features from selected numeric columns, excluding the bias column." + description: "Add polynomial and interaction features from selected numeric columns to input DataFrame." methods: __init__: description: "Initialize self." @@ -9,12 +9,16 @@ PolynomialExpansion: cols: type: list description: "Columns for polynomial expansion." + label_col: + type: str + description: "Label column name." degree: type: int description: "The degree of the polynomial features." default: 2 required: - cols + - label_col fit: description: "Fit the PolynomialExpansion model." parameters: @@ -36,14 +40,14 @@ PolynomialExpansion: returns: df: type: DataFrame - description: "The transformed DataFrame." + description: "The transformed DataFrame without duplicated columns." fit_transform: description: "Fit and transform the input DataFrame." parameters: properties: df: type: DataFrame - description: "The input DataFrame." + description: "The input DataFrame without duplicated columns." required: - df returns: @@ -224,7 +228,7 @@ CatCross: properties: cols: type: list - description: "Columns to be pairwise crossed." + description: "Columns to be pairwise crossed, at least 2 columns." max_cat_num: type: int description: "Maximum unique categories per crossed feature." @@ -430,4 +434,115 @@ GeneralSelection: returns: df: type: DataFrame - description: "The transformed DataFrame." \ No newline at end of file + description: "The transformed DataFrame." + + +TreeBasedSelection: + type: class + description: "Select features based on tree-based model and remove features with low importance." + methods: + __init__: + description: "Initialize self." + parameters: + properties: + label_col: + type: str + description: "Label column name." + task_type: + type: str + description: "Task type, 'cls' for classification, 'mcls' for multi-class classification, 'reg' for regression." 
+ enum: + - cls + - mcls + - reg + required: + - label_col + - task_type + fit: + description: "Fit the TreeBasedSelection model." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + transform: + description: "Transform the input DataFrame with the fitted model." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + returns: + df: + type: DataFrame + description: "The transformed DataFrame contain label_col." + fit_transform: + description: "Fit and transform the input DataFrame." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + returns: + df: + type: DataFrame + description: "The transformed DataFrame contain label_col." + +VarianceBasedSelection: + type: class + description: "Select features based on variance and remove features with low variance." + methods: + __init__: + description: "Initialize self." + parameters: + properties: + label_col: + type: str + description: "Label column name." + threshold: + type: float + description: "Threshold for variance." + default: 0.0 + required: + - label_col + fit: + description: "Fit the VarianceBasedSelection model." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + transform: + description: "Transform the input DataFrame with the fitted model." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + returns: + df: + type: DataFrame + description: "The transformed DataFrame contain label_col." + fit_transform: + description: "Fit and transform the input DataFrame." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + returns: + df: + type: DataFrame + description: "The transformed DataFrame contain label_col." \ No newline at end of file