Merge branch 'dev' into 'dev_make_tools'

# Conflicts:
#   metagpt/roles/ml_engineer.py
This commit is contained in:
刘棒棒 2023-12-26 06:18:27 +00:00
commit c76c1765ef
6 changed files with 276 additions and 55 deletions

View file

@@ -6,7 +6,7 @@
# @Desc :
UPDATE_DATA_COLUMNS = """
# Background
Keep dataset column information updated to reflect changes in training or testing datasets, aiding in informed decision-making during data analysis.
Keep dataset column information updated before model training.
## Done Tasks
```python
{history_code}
@@ -18,15 +18,13 @@ Update and print the dataset's column information only if the train or test data
from metagpt.tools.functions.libs.data_preprocess import get_column_info
column_info = get_column_info(df)
print("df_column_info")
print("column_info")
print(column_info)
```end
# Constraints:
- Use the DataFrame variable from 'Done Tasks' in place of df.
- Import `get_column_info` only if it's not already imported.
- Skip update if no changes in training/testing data, except for initial data load.
- No need to update info if only model evaluation is performed.
"""
GEN_DATA_DESC_PROMPT = """
@@ -185,7 +183,7 @@ obj_cols = train.select_dtypes(include='object').columns.tolist()
for col in obj_cols:
encoder = LabelEncoder()
train[col] = encoder.fit_transform(train[col])
encoder.fit(train[col].unique().tolist() + ['unknown'])
train[col] = encoder.transform(train[col])
test[col] = test[col].apply(lambda x: x if x in encoder.classes_ else 'unknown')
test[col] = encoder.transform(test[col])
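A self-contained sketch of the unseen-category pattern above, with hypothetical toy frames so it runs as-is; fitting on the train categories plus an 'unknown' sentinel lets unseen test values be encoded safely:

```python
import pandas as pd
from sklearn.preprocessing import LabelEncoder

train = pd.DataFrame({"color": ["red", "blue", "red"]})  # hypothetical data
test = pd.DataFrame({"color": ["green", "red"]})         # 'green' never seen in train

obj_cols = train.select_dtypes(include="object").columns.tolist()
for col in obj_cols:
    encoder = LabelEncoder()
    # Fit on the train categories plus a sentinel class for unseen test values.
    encoder.fit(train[col].unique().tolist() + ["unknown"])
    train[col] = encoder.transform(train[col])
    test[col] = test[col].apply(lambda x: x if x in encoder.classes_ else "unknown")
    test[col] = encoder.transform(test[col])
```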
@@ -241,6 +239,8 @@ from metagpt.tools.functions.libs.data_preprocess import FillMissingValue
train_processed = train.copy()
test_processed = test.copy()
num_cols = train_processed.select_dtypes(include='number').columns.tolist()
if 'label' in num_cols:
num_cols.remove('label')
fill_missing_value = FillMissingValue(features=num_cols, strategy='mean')
fill_missing_value.fit(train_processed)
train_processed = fill_missing_value.transform(train_processed)
@@ -266,23 +266,29 @@ The current task is about data preprocessing; please note the following:
- Monitor data types per column, applying appropriate methods.
- Ensure operations are on existing dataset columns.
- Avoid writing processed data to files.
- Avoid any changes to the label column, such as standardization.
- Prefer alternatives to one-hot encoding for categorical data.
- Only encode necessary categorical columns to allow for potential feature-specific engineering tasks later.
- Only encode or scale necessary columns to allow for potential feature-specific engineering tasks (such as time_extract, binning, or extraction) later.
- Every preprocessing step applied to the train data must also be applied to the test data separately at the same time (see the sketch after this prompt).
"""
FEATURE_ENGINEERING_PROMPT = """
The current task is about feature engineering. When performing it, please adhere to the following principles:
- Ensure operations are on existing dataset columns and consider the data type (numerical, categorical, etc.) and application scenario (classification, regression tasks, etc.).
- Create impactful features based on real-world knowledge and column info.
- Generate as diverse features as possible to improve the model's performance.
- Generate features as diverse as possible to improve the model's performance step by step.
- If potentially impactful features are not included in 'Code Steps', add new steps to generate them.
- Avoid creating redundant or excessively numerous features in one step.
- Exclude ID columns from feature generation and remove them.
- Every feature engineering step applied to the train data must also be applied to the test data separately at the same time (see the sketch after this prompt).
- Avoid using the label column to create features, except for cat encoding.
- Use the data from the previous task's result if it exists; do not mock or reload data yourself.
"""
MODEL_TRAIN_PROMPT = """
The current task is about training a model; please ensure high performance:
- Keep in mind that your user prioritizes results and is highly focused on model performance. So, when needed, feel free to use models of any complexity to improve effectiveness, such as LightGBM, XGBoost, CatBoost, etc.
- Before training, first check not is_numeric_dtype columns and use label encoding to convert them to numeric columns.
- If non-numeric columns exist, perform label encoding along with the other steps (see the sketch after this prompt).
- Use the data from the previous task's result directly; do not mock or reload data yourself.
- Set suitable hyperparameters for the model to push the metrics as high as possible.
"""
MODEL_EVALUATE_PROMPT = """

View file

@@ -84,7 +84,7 @@ class MLEngineer(Role):
self.plan.finish_current_task()
self.working_memory.clear()
if self.use_tools or self.use_udfs:
if (self.use_tools and task.task_type not in ['model_train', 'model_evaluate']) or self.use_udfs:
success, new_code = await self._update_data_columns()
if success:
task.code = task.code + "\n\n" + new_code
@@ -123,6 +123,7 @@ class MLEngineer(Role):
if is_update:
result, success = await self.execute_code.run(code)
if success:
print(result)
self.data_desc["column_info"] = result
return success, code
@@ -339,7 +340,7 @@ if __name__ == "__main__":
# requirement = f"This is a medical dataset with over fifty anonymized health characteristics linked to three age-related conditions. Your goal is to predict whether a subject has or has not been diagnosed with one of these conditions.The target column is Class. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report f1 score on the eval data. Train data path: {data_path}/split_train.csv, eval data path: {data_path}/split_eval.csv."
# data_path = f"{DATA_PATH}/santander-customer-transaction-prediction"
# requirement = f"This is a customers financial dataset. Your goal is to predict which customers will make a specific transaction in the future. The target column is target. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report F1 Score on the eval data. Train data path: '{data_path}/split_train.csv', eval data path: '{data_path}/split_eval.csv' ."
# requirement = f"This is a customers financial dataset. Your goal is to predict which customers will make a specific transaction in the future. The target column is target. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report AUC Score on the eval data. Train data path: '{data_path}/split_train.csv', eval data path: '{data_path}/split_eval.csv' ."
# data_path = f"{DATA_PATH}/house-prices-advanced-regression-techniques"
# requirement = f"This is a house price dataset, your goal is to predict the sale price of a property based on its features. The target column is SalePrice. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report RMSE between the logarithm of the predicted value and the logarithm of the observed sales price on the eval data. Train data path: '{data_path}/split_train.csv', eval data path: '{data_path}/split_eval.csv'."

View file

@@ -1,3 +1,5 @@
import json
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
@@ -20,10 +22,14 @@ class FillMissingValue(MLProcess):
self.si = None
def fit(self, df: pd.DataFrame):
if len(self.features) == 0:
return
self.si = SimpleImputer(strategy=self.strategy, fill_value=self.fill_value)
self.si.fit(df[self.features])
def transform(self, df: pd.DataFrame):
if len(self.features) == 0:
return df
df[self.features] = self.si.transform(df[self.features])
return df
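The new guards make the empty-feature case a no-op; a small sketch of why that matters, assuming a hypothetical frame with nothing numeric to impute:

```python
import pandas as pd

from metagpt.tools.functions.libs.data_preprocess import FillMissingValue

df = pd.DataFrame({"city": ["NY", None]})  # hypothetical: no numeric columns at all

fill = FillMissingValue(features=[], strategy="mean")
fill.fit(df)             # returns early instead of fitting SimpleImputer on zero columns
df = fill.transform(df)  # hands the frame back unchanged
```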
@@ -122,11 +128,15 @@ class LabelEncode(MLProcess):
self.le_encoders = []
def fit(self, df: pd.DataFrame):
if len(self.features) == 0:
return
for col in self.features:
le = LabelEncoder().fit(df[col].astype(str).unique().tolist() + ['unknown'])
self.le_encoders.append(le)
def transform(self, df: pd.DataFrame):
if len(self.features) == 0:
return df
for i in range(len(self.features)):
data_list = df[self.features[i]].astype(str).tolist()
for unique_item in np.unique(df[self.features[i]].astype(str)):
@@ -137,17 +147,23 @@
def get_column_info(df: pd.DataFrame) -> dict:
data = []
for i in df.columns:
nan_freq = float("%.2g" % (df[i].isna().mean() * 100))
n_unique = df[i].nunique()
data_type = str(df[i].dtype).replace("dtype('", "").replace("')", "")
if data_type == "O":
data_type = "object"
data.append([i, data_type, nan_freq, n_unique])
column_info = {
"Category": [],
"Numeric": [],
"Datetime": [],
"Others": [],
}
for col in df.columns:
data_type = str(df[col].dtype).replace("dtype('", "").replace("')", "")
if data_type.startswith("object"):
column_info["Category"].append(col)
elif data_type.startswith("int") or data_type.startswith("float"):
column_info["Numeric"].append(col)
elif data_type.startswith("datetime"):
column_info["Datetime"].append(col)
else:
column_info["Others"].append(col)
samples = pd.DataFrame(
data,
columns=["Column_name", "Data_type", "NaN_Frequency(%)", "N_unique"],
)
return samples.to_dict(orient='list')
if len(json.dumps(column_info)) > 2000:
column_info['Numeric'] = column_info['Numeric'][0:5] + ['Too many cols, omission here...']
return column_info
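With this rewrite the function groups columns by kind and truncates oversized output; a quick sketch of the return value for a small hypothetical frame, assuming get_column_info above is in scope:

```python
import pandas as pd

df = pd.DataFrame({
    "price": [1.5, 2.0],
    "city": ["NY", "LA"],
    "ts": pd.to_datetime(["2023-01-01", "2023-01-02"]),
})
print(get_column_info(df))
# -> {'Category': ['city'], 'Numeric': ['price'], 'Datetime': ['ts'], 'Others': []}
```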

View file

@@ -6,12 +6,12 @@
# @Desc : Feature Engineering Tools
import itertools
import lightgbm as lgb
import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta
from joblib import Parallel, delayed
from pandas.api.types import is_numeric_dtype
from pandas.core.dtypes.common import is_object_dtype
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import KFold
from sklearn.preprocessing import PolynomialFeatures, KBinsDiscretizer
@@ -19,15 +19,27 @@ from metagpt.tools.functions.libs.base import MLProcess
class PolynomialExpansion(MLProcess):
def __init__(self, cols: list, degree: int = 2):
def __init__(self, cols: list, degree: int = 2, label_col: str = None):
self.cols = cols
self.degree = degree
self.label_col = label_col
if self.label_col in self.cols:
self.cols.remove(self.label_col)
self.poly = PolynomialFeatures(degree=degree, include_bias=False)
def fit(self, df: pd.DataFrame):
if len(self.cols) == 0:
return
if len(self.cols) > 10:
corr = df[self.cols + [self.label_col]].corr()
corr = corr[self.label_col].abs().sort_values(ascending=False)
self.cols = corr.index.tolist()[1:11]
self.poly.fit(df[self.cols].fillna(0))
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
if len(self.cols) == 0:
return df
ts_data = self.poly.transform(df[self.cols].fillna(0))
column_name = self.poly.get_feature_names_out(self.cols)
ts_data = pd.DataFrame(ts_data, index=df.index, columns=column_name)
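A usage sketch for the updated signature, on hypothetical toy data; it assumes the rest of transform (truncated in this hunk) concatenates the expanded columns back onto the frame, as the schema below describes:

```python
import pandas as pd

train = pd.DataFrame({"x1": [1, 2, 3, 4], "x2": [2, 1, 4, 3], "label": [0, 1, 0, 1]})

# label_col is dropped from cols in __init__; with more than 10 cols, fit would
# keep only the 10 features most correlated with the label.
poly = PolynomialExpansion(cols=["x1", "x2", "label"], degree=2, label_col="label")
poly.fit(train)
train = poly.transform(train)  # appends x1^2, x1 x2, x2^2 (duplicate base columns dropped)
```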
@@ -158,27 +170,35 @@ class SplitBins(MLProcess):
df[self.cols] = self.encoder.transform(df[self.cols].fillna(0))
return df
# @registry.register("feature_engineering", ExtractTimeComps)
# def extract_time_comps(df, time_col, time_comps):
# time_s = pd.to_datetime(df[time_col], errors="coerce")
# time_comps_df = pd.DataFrame()
#
# if "year" in time_comps:
# time_comps_df["year"] = time_s.dt.year
# if "month" in time_comps:
# time_comps_df["month"] = time_s.dt.month
# if "day" in time_comps:
# time_comps_df["day"] = time_s.dt.day
# if "hour" in time_comps:
# time_comps_df["hour"] = time_s.dt.hour
# if "dayofweek" in time_comps:
# time_comps_df["dayofweek"] = time_s.dt.dayofweek + 1
# if "is_weekend" in time_comps:
# time_comps_df["is_weekend"] = time_s.dt.dayofweek.isin([5, 6]).astype(int)
# df = pd.concat([df, time_comps_df], axis=1)
# return df
#
#
class ExtractTimeComps(MLProcess):
def __init__(self, time_col: str, time_comps: list):
self.time_col = time_col
self.time_comps = time_comps
def fit(self, df: pd.DataFrame):
pass
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
time_s = pd.to_datetime(df[self.time_col], errors="coerce")
time_comps_df = pd.DataFrame()
if "year" in self.time_comps:
time_comps_df["year"] = time_s.dt.year
if "month" in self.time_comps:
time_comps_df["month"] = time_s.dt.month
if "day" in self.time_comps:
time_comps_df["day"] = time_s.dt.day
if "hour" in self.time_comps:
time_comps_df["hour"] = time_s.dt.hour
if "dayofweek" in self.time_comps:
time_comps_df["dayofweek"] = time_s.dt.dayofweek + 1
if "is_weekend" in self.time_comps:
time_comps_df["is_weekend"] = time_s.dt.dayofweek.isin([5, 6]).astype(int)
df = pd.concat([df, time_comps_df], axis=1)
return df
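A usage sketch of the class form, with a hypothetical timestamp column:

```python
import pandas as pd

df = pd.DataFrame({"ts": ["2023-12-23 10:00", "2023-12-26 08:30"]})  # hypothetical

etc = ExtractTimeComps(time_col="ts", time_comps=["dayofweek", "is_weekend"])
etc.fit(df)             # no-op; all the work happens in transform
df = etc.transform(df)  # adds dayofweek (Mon=1 .. Sun=7) and is_weekend (0/1)
```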
# @registry.register("feature_engineering", FeShiftByTime)
# def fe_shift_by_time(df, time_col, group_col, shift_col, periods, freq):
# df[time_col] = pd.to_datetime(df[time_col])
@@ -290,3 +310,66 @@ class GeneralSelection(MLProcess):
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
df = df[self.feats + [self.label_col]]
return df
class TreeBasedSelection(MLProcess):
def __init__(self, label_col: str, task_type: str):
self.label_col = label_col
self.task_type = task_type
self.feats = None
def fit(self, df: pd.DataFrame):
params = {
'boosting_type': 'gbdt',
'objective': 'binary',
'learning_rate': 0.1,
'num_leaves': 31,
}
if self.task_type == "cls":
params["objective"] = "binary"
params["metric"] = "auc"
elif self.task_type == "mcls":
params["objective"] = "multiclass"
params["num_class"] = df[self.label_col].nunique()
params["metric"] = "auc_mu"
elif self.task_type == "reg":
params["objective"] = "regression"
params["metric"] = "rmse"
num_cols = df.select_dtypes(include=np.number).columns.tolist()
cols = [f for f in num_cols if f not in [self.label_col]]
dtrain = lgb.Dataset(df[cols], df[self.label_col])
model = lgb.train(params, dtrain, num_boost_round=100)
df_imp = pd.DataFrame({'feature_name': dtrain.feature_name,
'importance': model.feature_importance("gain")})
df_imp.sort_values("importance", ascending=False, inplace=True)
df_imp = df_imp[df_imp["importance"] > 0]
self.feats = df_imp['feature_name'].tolist()
self.feats.append(self.label_col)
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
df = df[self.feats]
return df
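A small sketch of the new selector on synthetic data, one informative column and one pure-noise column; features with zero gain are dropped and the label is retained:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)  # synthetic data for illustration
df = pd.DataFrame({"signal": rng.normal(size=500), "noise": rng.normal(size=500)})
df["label"] = (df["signal"] > 0).astype(int)

sel = TreeBasedSelection(label_col="label", task_type="cls")
sel.fit(df)             # trains LightGBM and ranks numeric features by gain
df = sel.transform(df)  # keeps positive-importance features plus the label
```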
class VarianceBasedSelection(MLProcess):
def __init__(self, label_col: str, threshold: float = 0):
self.label_col = label_col
self.threshold = threshold
self.feats = None
self.selector = VarianceThreshold(threshold=self.threshold)
def fit(self, df: pd.DataFrame):
num_cols = df.select_dtypes(include=np.number).columns.tolist()
cols = [f for f in num_cols if f not in [self.label_col]]
self.selector.fit(df[cols])
self.feats = df[cols].columns[self.selector.get_support(indices=True)].tolist()
self.feats.append(self.label_col)
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
df = df[self.feats]
return df
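And its variance-based counterpart, where the default threshold of 0 drops constant columns:

```python
import pandas as pd

df = pd.DataFrame({"constant": [1, 1, 1, 1], "varies": [1, 2, 3, 4], "label": [0, 1, 0, 1]})

sel = VarianceBasedSelection(label_col="label")
sel.fit(df)             # 'constant' has zero variance and is filtered out
df = sel.transform(df)  # -> columns ['varies', 'label']
```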

View file

@@ -11,7 +11,7 @@ FillMissingValue:
description: "columns to be processed"
strategy:
type: str
description: "the imputation strategy"
description: "the imputation strategy, notice mean/median can only be used for numeric features"
default: mean
enum:
- mean

View file

@@ -1,6 +1,6 @@
PolynomialExpansion:
type: class
description: "Add polynomial and interaction features from selected numeric columns, excluding the bias column."
description: "Add polynomial and interaction features from selected numeric columns to input DataFrame."
methods:
__init__:
description: "Initialize self."
@@ -9,12 +9,16 @@ PolynomialExpansion:
cols:
type: list
description: "Columns for polynomial expansion."
label_col:
type: str
description: "Label column name."
degree:
type: int
description: "The degree of the polynomial features."
default: 2
required:
- cols
- label_col
fit:
description: "Fit the PolynomialExpansion model."
parameters:
@@ -36,14 +40,14 @@ PolynomialExpansion:
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
description: "The transformed DataFrame without duplicated columns."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
description: "The input DataFrame without duplicated columns."
required:
- df
returns:
@@ -224,7 +228,7 @@ CatCross:
properties:
cols:
type: list
description: "Columns to be pairwise crossed."
description: "Columns to be pairwise crossed, at least 2 columns."
max_cat_num:
type: int
description: "Maximum unique categories per crossed feature."
@@ -430,4 +434,115 @@ GeneralSelection:
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
description: "The transformed DataFrame."
TreeBasedSelection:
type: class
description: "Select features based on tree-based model and remove features with low importance."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
label_col:
type: str
description: "Label column name."
task_type:
type: str
description: "Task type, 'cls' for classification, 'mcls' for multi-class classification, 'reg' for regression."
enum:
- cls
- mcls
- reg
required:
- label_col
- task_type
fit:
description: "Fit the TreeBasedSelection model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame contain label_col."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame contain label_col."
VarianceBasedSelection:
type: class
description: "Select features based on variance and remove features with low variance."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
label_col:
type: str
description: "Label column name."
threshold:
type: float
description: "Threshold for variance."
default: 0.0
required:
- label_col
fit:
description: "Fit the VarianceBasedSelection model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame contain label_col."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame contain label_col."