Merge branch 'dev' into 'dev_make_tools'

# Conflicts:
#   metagpt/roles/ml_engineer.py
This commit is contained in:
刘棒棒 2023-12-26 06:18:27 +00:00
commit c76c1765ef
6 changed files with 276 additions and 55 deletions

View file

@ -1,3 +1,5 @@
import json
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
@ -20,10 +22,14 @@ class FillMissingValue(MLProcess):
self.si = None
def fit(self, df: pd.DataFrame):
    """Fit a SimpleImputer on the configured feature columns.

    No-op when no features are configured; otherwise builds an imputer
    with the instance's strategy/fill_value and fits it on df[features].
    """
    if self.features:
        self.si = SimpleImputer(strategy=self.strategy, fill_value=self.fill_value)
        self.si.fit(df[self.features])
def transform(self, df: pd.DataFrame):
    """Impute the configured feature columns in place and return the frame.

    Returns df unchanged when no features are configured.
    """
    if self.features:
        df[self.features] = self.si.transform(df[self.features])
    return df
@ -122,11 +128,15 @@ class LabelEncode(MLProcess):
self.le_encoders = []
def fit(self, df: pd.DataFrame):
    """Fit one LabelEncoder per configured feature.

    Each encoder's classes include an extra 'unknown' bucket so values
    unseen at fit time can still be mapped at transform time.
    No-op when no features are configured.
    """
    if not self.features:
        return
    self.le_encoders.extend(
        LabelEncoder().fit(df[col].astype(str).unique().tolist() + ['unknown'])
        for col in self.features
    )
def transform(self, df: pd.DataFrame):
if len(self.features) == 0:
return df
for i in range(len(self.features)):
data_list = df[self.features[i]].astype(str).tolist()
for unique_item in np.unique(df[self.features[i]].astype(str)):
@ -137,17 +147,23 @@ class LabelEncode(MLProcess):
def get_column_info(df: pd.DataFrame) -> dict:
    """Group the DataFrame's columns by coarse dtype family.

    Args:
        df: Input DataFrame.

    Returns:
        A dict with keys "Category", "Numeric", "Datetime" and "Others",
        each mapping to a list of column names. If the JSON-serialized
        result exceeds 2000 characters, the "Numeric" list is truncated to
        its first 5 entries plus an omission marker so the summary stays
        prompt-sized.
    """
    # NOTE(review): the rendered merge diff interleaved the removed
    # nan_freq/samples implementation with this added one; this is the
    # reconstructed post-merge version.
    column_info = {
        "Category": [],
        "Numeric": [],
        "Datetime": [],
        "Others": [],
    }
    for col in df.columns:
        data_type = str(df[col].dtype).replace("dtype('", "").replace("')", "")
        if data_type.startswith("object"):
            column_info["Category"].append(col)
        elif data_type.startswith("int") or data_type.startswith("float"):
            column_info["Numeric"].append(col)
        elif data_type.startswith("datetime"):
            column_info["Datetime"].append(col)
        else:
            # bool, category, timedelta, etc. fall through here
            column_info["Others"].append(col)

    if len(json.dumps(column_info)) > 2000:
        column_info['Numeric'] = column_info['Numeric'][0:5] + ['Too many cols, omission here...']
    return column_info

View file

@ -6,12 +6,12 @@
# @Desc : Feature Engineering Tools
import itertools
import lightgbm as lgb
import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta
from joblib import Parallel, delayed
from pandas.api.types import is_numeric_dtype
from pandas.core.dtypes.common import is_object_dtype
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import KFold
from sklearn.preprocessing import PolynomialFeatures, KBinsDiscretizer
@ -19,15 +19,27 @@ from metagpt.tools.functions.libs.base import MLProcess
class PolynomialExpansion(MLProcess):
def __init__(self, cols: list, degree: int = 2, label_col: str = None):
    """Initialize the polynomial expansion.

    Args:
        cols: Columns to expand; the label column is removed if present.
        degree: Degree of the polynomial features (default 2).
        label_col: Label column name, used to rank columns at fit time;
            backward-compatible addition (defaults to None).
    """
    # NOTE(review): the diff rendering kept both the old and new __init__
    # headers; this is the merged (new) signature only.
    self.cols = cols
    self.degree = degree
    self.label_col = label_col
    if self.label_col in self.cols:
        self.cols.remove(self.label_col)
    self.poly = PolynomialFeatures(degree=degree, include_bias=False)
def fit(self, df: pd.DataFrame):
    """Fit the polynomial expansion on the configured columns.

    When more than 10 columns are configured, keeps only the 10 most
    label-correlated ones (by absolute correlation, label itself excluded).
    NaNs are filled with 0 before fitting. No-op when no columns are set.
    """
    if not self.cols:
        return
    if len(self.cols) > 10:
        ranked = (
            df[self.cols + [self.label_col]]
            .corr()[self.label_col]
            .abs()
            .sort_values(ascending=False)
        )
        # Index 0 is the label's self-correlation; take the next 10.
        self.cols = ranked.index.tolist()[1:11]
    self.poly.fit(df[self.cols].fillna(0))
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
if len(self.cols) == 0:
return df
ts_data = self.poly.transform(df[self.cols].fillna(0))
column_name = self.poly.get_feature_names_out(self.cols)
ts_data = pd.DataFrame(ts_data, index=df.index, columns=column_name)
@ -158,27 +170,35 @@ class SplitBins(MLProcess):
df[self.cols] = self.encoder.transform(df[self.cols].fillna(0))
return df
# @registry.register("feature_engineering", ExtractTimeComps)
# def extract_time_comps(df, time_col, time_comps):
# time_s = pd.to_datetime(df[time_col], errors="coerce")
# time_comps_df = pd.DataFrame()
#
# if "year" in time_comps:
# time_comps_df["year"] = time_s.dt.year
# if "month" in time_comps:
# time_comps_df["month"] = time_s.dt.month
# if "day" in time_comps:
# time_comps_df["day"] = time_s.dt.day
# if "hour" in time_comps:
# time_comps_df["hour"] = time_s.dt.hour
# if "dayofweek" in time_comps:
# time_comps_df["dayofweek"] = time_s.dt.dayofweek + 1
# if "is_weekend" in time_comps:
# time_comps_df["is_weekend"] = time_s.dt.dayofweek.isin([5, 6]).astype(int)
# df = pd.concat([df, time_comps_df], axis=1)
# return df
#
#
class ExtractTimeComps(MLProcess):
    """Append calendar component columns derived from a datetime column.

    Parameters:
        time_col: Column parsed with ``pd.to_datetime`` (errors coerced to NaT).
        time_comps: Components to emit; any of "year", "month", "day", "hour",
            "dayofweek" (Monday=1 .. Sunday=7) and "is_weekend" (0/1 flag).
    """

    def __init__(self, time_col: str, time_comps: list):
        self.time_col = time_col
        self.time_comps = time_comps

    def fit(self, df: pd.DataFrame):
        # Stateless transform; nothing to learn.
        pass

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return df with the requested time-component columns concatenated."""
        parsed = pd.to_datetime(df[self.time_col], errors="coerce")
        extractors = {
            "year": lambda s: s.dt.year,
            "month": lambda s: s.dt.month,
            "day": lambda s: s.dt.day,
            "hour": lambda s: s.dt.hour,
            "dayofweek": lambda s: s.dt.dayofweek + 1,
            "is_weekend": lambda s: s.dt.dayofweek.isin([5, 6]).astype(int),
        }
        comps = pd.DataFrame()
        # Fixed order matches the original component-by-component checks.
        for name in ("year", "month", "day", "hour", "dayofweek", "is_weekend"):
            if name in self.time_comps:
                comps[name] = extractors[name](parsed)
        return pd.concat([df, comps], axis=1)
# @registry.register("feature_engineering", FeShiftByTime)
# def fe_shift_by_time(df, time_col, group_col, shift_col, periods, freq):
# df[time_col] = pd.to_datetime(df[time_col])
@ -290,3 +310,66 @@ class GeneralSelection(MLProcess):
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
    """Project the DataFrame onto the selected features plus the label column."""
    keep = list(self.feats) + [self.label_col]
    return df[keep]
class TreeBasedSelection(MLProcess):
    """Select features via LightGBM gain importance.

    Trains a GBDT on the numeric columns and keeps only features whose
    gain importance is strictly positive; the label column is always kept.

    Parameters:
        label_col: Label column name.
        task_type: "cls" (binary), "mcls" (multiclass) or "reg" (regression).
    """

    def __init__(self, label_col: str, task_type: str):
        self.label_col = label_col
        self.task_type = task_type
        self.feats = None  # populated by fit()

    def fit(self, df: pd.DataFrame):
        """Train a 100-round GBDT and record the positively-important features."""
        params = {
            'boosting_type': 'gbdt',
            'objective': 'binary',
            'learning_rate': 0.1,
            'num_leaves': 31,
        }
        # NOTE(review): an unrecognized task_type silently falls back to the
        # binary objective above — confirm that is intended.
        if self.task_type == "cls":
            params["objective"] = "binary"
            params["metric"] = "auc"
        elif self.task_type == "mcls":
            params["objective"] = "multiclass"
            params["num_class"] = df[self.label_col].nunique()
            params["metric"] = "auc_mu"
        elif self.task_type == "reg":
            params["objective"] = "regression"
            params["metric"] = "rmse"

        feature_cols = [c for c in df.select_dtypes(include=np.number).columns if c != self.label_col]
        train_set = lgb.Dataset(df[feature_cols], df[self.label_col])
        booster = lgb.train(params, train_set, num_boost_round=100)

        importance = pd.DataFrame({'feature_name': train_set.feature_name,
                                   'importance': booster.feature_importance("gain")})
        importance.sort_values("importance", ascending=False, inplace=True)
        kept = importance.loc[importance["importance"] > 0, 'feature_name'].tolist()
        self.feats = kept + [self.label_col]

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Project the DataFrame onto the selected features (label included)."""
        return df[self.feats]
class VarianceBasedSelection(MLProcess):
    """Drop numeric features whose variance does not exceed a threshold.

    The label column is always retained.

    Parameters:
        label_col: Label column name.
        threshold: Variance threshold passed to sklearn's VarianceThreshold
            (default 0, i.e. remove constant columns).
    """

    def __init__(self, label_col: str, threshold: float = 0):
        self.label_col = label_col
        self.threshold = threshold
        self.feats = None  # populated by fit()
        self.selector = VarianceThreshold(threshold=self.threshold)

    def fit(self, df: pd.DataFrame):
        """Fit the variance filter on all numeric, non-label columns."""
        candidates = [c for c in df.select_dtypes(include=np.number).columns if c != self.label_col]
        self.selector.fit(df[candidates])
        kept_idx = self.selector.get_support(indices=True)
        self.feats = df[candidates].columns[kept_idx].tolist() + [self.label_col]

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Project the DataFrame onto the surviving features (label included)."""
        return df[self.feats]

View file

@ -11,7 +11,7 @@ FillMissingValue:
description: "columns to be processed"
strategy:
type: str
description: "the imputation strategy, notice mean/median can only be used for numeric features"
default: mean
enum:
- mean

View file

@ -1,6 +1,6 @@
PolynomialExpansion:
type: class
description: "Add polynomial and interaction features from selected numeric columns to input DataFrame."
methods:
__init__:
description: "Initialize self."
@ -9,12 +9,16 @@ PolynomialExpansion:
cols:
type: list
description: "Columns for polynomial expansion."
label_col:
type: str
description: "Label column name."
degree:
type: int
description: "The degree of the polynomial features."
default: 2
required:
- cols
- label_col
fit:
description: "Fit the PolynomialExpansion model."
parameters:
@ -36,14 +40,14 @@ PolynomialExpansion:
returns:
df:
type: DataFrame
description: "The transformed DataFrame without duplicated columns."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame without duplicated columns."
required:
- df
returns:
@ -224,7 +228,7 @@ CatCross:
properties:
cols:
type: list
description: "Columns to be pairwise crossed, at least 2 columns."
max_cat_num:
type: int
description: "Maximum unique categories per crossed feature."
@ -430,4 +434,115 @@ GeneralSelection:
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
TreeBasedSelection:
type: class
description: "Select features based on tree-based model and remove features with low importance."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
label_col:
type: str
description: "Label column name."
task_type:
type: str
description: "Task type, 'cls' for classification, 'mcls' for multi-class classification, 'reg' for regression."
enum:
- cls
- mcls
- reg
required:
- label_col
- task_type
fit:
description: "Fit the TreeBasedSelection model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame containing label_col."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame containing label_col."
VarianceBasedSelection:
type: class
description: "Select features based on variance and remove features with low variance."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
label_col:
type: str
description: "Label column name."
threshold:
type: float
description: "Threshold for variance."
default: 0.0
required:
- label_col
fit:
description: "Fit the VarianceBasedSelection model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame containing label_col."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame containing label_col."