diff --git a/metagpt/tools/functions/libs/data_preprocess.py b/metagpt/tools/functions/libs/data_preprocess.py
index 8c70462ee..f1665b405 100644
--- a/metagpt/tools/functions/libs/data_preprocess.py
+++ b/metagpt/tools/functions/libs/data_preprocess.py
@@ -1,3 +1,5 @@
+import json
+
 import numpy as np
 import pandas as pd
 from sklearn.impute import SimpleImputer
@@ -20,10 +22,14 @@ class FillMissingValue(MLProcess):
         self.si = None
 
     def fit(self, df: pd.DataFrame):
+        if len(self.features) == 0:  # no columns selected: skip fitting, self.si is not (re)built
+            return
         self.si = SimpleImputer(strategy=self.strategy, fill_value=self.fill_value)
         self.si.fit(df[self.features])
 
     def transform(self, df: pd.DataFrame):
+        if len(self.features) == 0:  # no columns selected: the imputer is not consulted
+            return df  # the same object, unmodified
         df[self.features] = self.si.transform(df[self.features])
         return df
 
@@ -122,11 +128,19 @@ class LabelEncode(MLProcess):
         self.le_encoders = []
 
     def fit(self, df: pd.DataFrame):
+        """Fit one LabelEncoder per column of ``self.features`` (plus the class 'unknown')."""
+        # Rebuild the list on every call: appending to the encoders left over
+        # from an earlier fit() would leave the stale ones at the front.
+        self.le_encoders = []
+        if len(self.features) == 0:
+            return
         for col in self.features:
             le = LabelEncoder().fit(df[col].astype(str).unique().tolist() + ['unknown'])
             self.le_encoders.append(le)
 
     def transform(self, df: pd.DataFrame):
+        if len(self.features) == 0:
+            return df
         for i in range(len(self.features)):
             data_list = df[self.features[i]].astype(str).tolist()
             for unique_item in np.unique(df[self.features[i]].astype(str)):
@@ -137,17 +147,50 @@ class LabelEncode(MLProcess):
 
 
 def get_column_info(df: pd.DataFrame) -> dict:
-    data = []
-    for i in df.columns:
-        nan_freq = float("%.2g" % (df[i].isna().mean() * 100))
-        n_unique = df[i].nunique()
-        data_type = str(df[i].dtype).replace("dtype('", "").replace("')", "")
-        if data_type == "O":
-            data_type = "object"
-        data.append([i, data_type, nan_freq, n_unique])
+    """Group the column labels of ``df`` by coarse data-type family.
+
+    Args:
+        df: The DataFrame whose columns are summarised.
+
+    Returns:
+        A dict with the keys "Category", "Numeric", "Datetime" and "Others",
+        each holding the list of matching column labels. If the JSON form of
+        the summary is longer than 2000 characters, groups with more than five
+        labels are cut (Numeric first) to five labels plus an omission marker.
+    """
+    max_chars = 2000  # size budget for the JSON-serialised summary
+    max_cols = 5  # labels kept per group once the budget is exceeded
+    omission = 'Too many cols, omission here...'
+    api = pd.api.types
+
+    column_info = {
+        "Category": [],
+        "Numeric": [],
+        "Datetime": [],
+        "Others": [],
+    }
+    # Dtype predicates instead of dtype-name prefixes: "uint8" and the nullable
+    # "Int64"/"Float64" count as numeric, while "interval[...]" no longer does.
+    for col, dtype in df.dtypes.items():
+        if api.is_object_dtype(dtype):
+            group = "Category"
+        elif api.is_integer_dtype(dtype) or api.is_float_dtype(dtype):
+            group = "Numeric"
+        elif api.is_datetime64_any_dtype(dtype):
+            group = "Datetime"
+        else:
+            group = "Others"
+        column_info[group].append(col)
 
-    samples = pd.DataFrame(
-        data,
-        columns=["Column_name", "Data_type", "NaN_Frequency(%)", "N_unique"],
-    )
-    return samples.to_dict(orient='list')
+    def too_long() -> bool:
+        # default=str keeps the size check from raising on labels json cannot encode
+        return len(json.dumps(column_info, default=str)) > max_chars
+
+    if too_long():
+        # Numeric is shortened first; the other groups only if still over budget.
+        for group in ("Numeric", "Category", "Datetime", "Others"):
+            if len(column_info[group]) > max_cols:
+                column_info[group] = column_info[group][:max_cols] + [omission]
+            if not too_long():
+                break
+    return column_info
diff --git a/metagpt/tools/functions/libs/feature_engineering.py b/metagpt/tools/functions/libs/feature_engineering.py
index 1ec2b9675..df36752b9 100644
--- a/metagpt/tools/functions/libs/feature_engineering.py
+++ b/metagpt/tools/functions/libs/feature_engineering.py
@@ -6,12 +6,12 @@
 # @Desc    : Feature Engineering Tools
 import itertools
 
+import lightgbm as lgb
 import numpy as np
 import pandas as pd
-from dateutil.relativedelta import relativedelta
 from joblib import Parallel, delayed
-from pandas.api.types import is_numeric_dtype
 from pandas.core.dtypes.common import is_object_dtype
+from sklearn.feature_selection import VarianceThreshold
 from sklearn.model_selection import KFold
 from sklearn.preprocessing import PolynomialFeatures, KBinsDiscretizer
 
@@ -19,15 +19,42 @@ from metagpt.tools.functions.libs.base import MLProcess
 
 
 class PolynomialExpansion(MLProcess):
-    def __init__(self, cols: list, degree: int = 2):
+    """Add polynomial and interaction features built from ``cols``.
+
+    Args:
+        cols: Columns to expand.
+        degree: Degree handed to ``PolynomialFeatures``.
+        label_col: Label column. It is excluded from ``cols`` and, when
+            more than 10 columns are given, used by ``fit`` to rank them.
+    """
+
+    def __init__(self, cols: list, degree: int = 2, label_col: str = None):
         self.cols = cols
         self.degree = degree
+        self.label_col = label_col
+        if self.label_col in self.cols:
+            # Build a new list instead of calling .remove(), which would
+            # silently edit the list object owned by the caller.
+            self.cols = [col for col in self.cols if col != self.label_col]
         self.poly = PolynomialFeatures(degree=degree, include_bias=False)
 
     def fit(self, df: pd.DataFrame):
+        if len(self.cols) == 0:
+            return
+        # With many inputs, keep the 10 columns most correlated (in absolute
+        # value) with the label; skipped when df does not carry the label.
+        if len(self.cols) > 10 and self.label_col in df.columns:
+            corr = df[self.cols + [self.label_col]].corr()[self.label_col]
+            # Drop the label by name: taking it out by position assumes it sorts
+            # first, which fails if its self-correlation is NaN or tied.
+            corr = corr.drop(self.label_col).abs().sort_values(ascending=False)
+            self.cols = corr.index.tolist()[:10]
+
         self.poly.fit(df[self.cols].fillna(0))
 
     def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        if len(self.cols) == 0:
+            return df
         ts_data = self.poly.transform(df[self.cols].fillna(0))
         column_name = self.poly.get_feature_names_out(self.cols)
         ts_data = pd.DataFrame(ts_data, index=df.index, columns=column_name)
@@ -158,27 +170,37 @@ class SplitBins(MLProcess):
         df[self.cols] = self.encoder.transform(df[self.cols].fillna(0))
         return df
 
-# @registry.register("feature_engineering", ExtractTimeComps)
-# def extract_time_comps(df, time_col, time_comps):
-#     time_s = pd.to_datetime(df[time_col], errors="coerce")
-#     time_comps_df = pd.DataFrame()
-#
-#     if "year" in time_comps:
-#         time_comps_df["year"] = time_s.dt.year
-#     if "month" in time_comps:
-#         time_comps_df["month"] = time_s.dt.month
-#     if "day" in time_comps:
-#         time_comps_df["day"] = time_s.dt.day
-#     if "hour" in time_comps:
-#         time_comps_df["hour"] = time_s.dt.hour
-#     if "dayofweek" in time_comps:
-#         time_comps_df["dayofweek"] = time_s.dt.dayofweek + 1
-#     if "is_weekend" in time_comps:
-#         time_comps_df["is_weekend"] = time_s.dt.dayofweek.isin([5, 6]).astype(int)
-#     df = pd.concat([df, time_comps_df], axis=1)
-#     return df
-#
-#
+
+class ExtractTimeComps(MLProcess):
+    """Append calendar components of ``time_col`` (values that fail to parse become NaT)."""
+
+    def __init__(self, time_col: str, time_comps: list):
+        self.time_col = time_col
+        self.time_comps = time_comps
+
+    def fit(self, df: pd.DataFrame):
+        # Stateless step: nothing is learned from the data.
+        pass
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Return ``df`` with one extra column per requested component."""
+        time_s = pd.to_datetime(df[self.time_col], errors="coerce")
+        # Insertion order fixes the order of the generated columns.
+        extractors = {
+            "year": lambda: time_s.dt.year,
+            "month": lambda: time_s.dt.month,
+            "day": lambda: time_s.dt.day,
+            "hour": lambda: time_s.dt.hour,
+            "dayofweek": lambda: time_s.dt.dayofweek + 1,
+            "is_weekend": lambda: time_s.dt.dayofweek.isin([5, 6]).astype(int),
+        }
+        extracted = pd.DataFrame()
+        for name, extract in extractors.items():
+            if name in self.time_comps:
+                extracted[name] = extract()
+        return pd.concat([df, extracted], axis=1)
+
+
 # @registry.register("feature_engineering", FeShiftByTime)
 # def fe_shift_by_time(df, time_col, group_col, shift_col, periods, freq):
 #     df[time_col] = pd.to_datetime(df[time_col])
@@ -290,3 +310,101 @@ class GeneralSelection(MLProcess):
     def transform(self, df: pd.DataFrame) -> pd.DataFrame:
         df = df[self.feats + [self.label_col]]
         return df
+
+
+def _keep_selected(df: pd.DataFrame, feats: list, label_col: str) -> pd.DataFrame:
+    """Return ``df[feats]``, leaving out ``label_col`` when ``df`` does not have it."""
+    # The label may be missing (e.g. at prediction time); every other selected
+    # column is still required and raises KeyError as usual if absent.
+    return df[[f for f in feats if f != label_col or f in df.columns]]
+
+
+class TreeBasedSelection(MLProcess):
+    """Keep the numeric features that get a positive gain importance from LightGBM.
+
+    Non-numeric feature columns are not considered and are dropped by ``transform``.
+
+    Args:
+        label_col: Name of the label column.
+        task_type: "cls" (binary), "mcls" (multi-class) or "reg" (regression);
+            any other value keeps the default binary objective.
+    """
+
+    def __init__(self, label_col: str, task_type: str):
+        self.label_col = label_col
+        self.task_type = task_type
+        self.feats = None
+
+    def fit(self, df: pd.DataFrame):
+        params = {
+            'boosting_type': 'gbdt',
+            'objective': 'binary',
+            'learning_rate': 0.1,
+            'num_leaves': 31,
+        }
+
+        if self.task_type == "cls":
+            params["objective"] = "binary"
+            params["metric"] = "auc"
+        elif self.task_type == "mcls":
+            params["objective"] = "multiclass"
+            params["num_class"] = df[self.label_col].nunique()
+            params["metric"] = "auc_mu"
+        elif self.task_type == "reg":
+            params["objective"] = "regression"
+            params["metric"] = "rmse"
+
+        num_cols = df.select_dtypes(include=np.number).columns.tolist()
+        cols = [f for f in num_cols if f != self.label_col]
+        if len(cols) == 0:
+            # No numeric feature to rank, so there is nothing to train on.
+            self.feats = [self.label_col]
+            return
+
+        dtrain = lgb.Dataset(df[cols], df[self.label_col])
+        model = lgb.train(params, dtrain, num_boost_round=100)
+        # Pair the importances with the DataFrame's own labels (same order as
+        # the training columns) so that transform() can index df with them even
+        # when the labels are not strings.
+        df_imp = pd.DataFrame({'feature_name': cols,
+                               'importance': model.feature_importance("gain")})
+
+        df_imp = df_imp[df_imp["importance"] > 0]
+        df_imp = df_imp.sort_values("importance", ascending=False)
+        self.feats = df_imp['feature_name'].tolist()
+        self.feats.append(self.label_col)
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        return _keep_selected(df, self.feats, self.label_col)
+
+
+class VarianceBasedSelection(MLProcess):
+    """Keep the numeric features retained by sklearn's ``VarianceThreshold``.
+
+    Non-numeric feature columns are not considered and are dropped by ``transform``.
+
+    Args:
+        label_col: Name of the label column.
+        threshold: Variance threshold handed to ``VarianceThreshold``.
+    """
+
+    def __init__(self, label_col: str, threshold: float = 0):
+        self.label_col = label_col
+        self.threshold = threshold
+        self.feats = None
+        self.selector = VarianceThreshold(threshold=self.threshold)
+
+    def fit(self, df: pd.DataFrame):
+        num_cols = df.select_dtypes(include=np.number).columns.tolist()
+        cols = [f for f in num_cols if f != self.label_col]
+        if len(cols) == 0:
+            # No numeric feature to screen; the selector is left unfitted.
+            self.feats = [self.label_col]
+            return
+
+        self.selector.fit(df[cols])
+        self.feats = [cols[i] for i in self.selector.get_support(indices=True)]
+        self.feats.append(self.label_col)
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        return _keep_selected(df, self.feats, self.label_col)
diff --git a/metagpt/tools/functions/schemas/data_preprocess.yml b/metagpt/tools/functions/schemas/data_preprocess.yml
index 95b0124cc..4de697abd 100644
--- a/metagpt/tools/functions/schemas/data_preprocess.yml
+++ b/metagpt/tools/functions/schemas/data_preprocess.yml
@@ -11,7 +11,7 @@ FillMissingValue:
             description: "columns to be processed"
           strategy:
             type: str
-            description: "the imputation strategy"
+            description: "the imputation strategy; note that mean/median can only be used for numeric features"
             default: mean
             enum:
               - mean
diff --git a/metagpt/tools/functions/schemas/feature_engineering.yml b/metagpt/tools/functions/schemas/feature_engineering.yml
index 3ba9e863b..62e6ad5b3 100644
--- a/metagpt/tools/functions/schemas/feature_engineering.yml
+++ b/metagpt/tools/functions/schemas/feature_engineering.yml
@@ -1,6 +1,6 @@
 PolynomialExpansion:
   type: class
-  description: "Add polynomial and interaction features from selected numeric columns, excluding the bias column."
+  description: "Add polynomial and interaction features from selected numeric columns to input DataFrame."
   methods:
     __init__:
       description: "Initialize self."
@@ -9,12 +9,16 @@ PolynomialExpansion:
           cols:
             type: list
             description: "Columns for polynomial expansion."
+          label_col:
+            type: str
+            description: "Label column name."
           degree:
             type: int
             description: "The degree of the polynomial features."
             default: 2
         required:
           - cols
+          - label_col
     fit:
       description: "Fit the PolynomialExpansion model."
       parameters:
@@ -36,14 +40,14 @@ PolynomialExpansion:
       returns:
         df:
           type: DataFrame
-          description: "The transformed DataFrame."
+          description: "The transformed DataFrame without duplicated columns."
     fit_transform:
       description: "Fit and transform the input DataFrame."
       parameters:
         properties:
           df:
             type: DataFrame
-            description: "The input DataFrame."
+            description: "The input DataFrame without duplicated columns."
         required:
           - df
       returns:
@@ -224,7 +228,7 @@ CatCross:
         properties:
           cols:
             type: list
-            description: "Columns to be pairwise crossed."
+            description: "Columns to be pairwise crossed, at least 2 columns."
           max_cat_num:
             type: int
             description: "Maximum unique categories per crossed feature."
@@ -430,4 +434,115 @@ GeneralSelection:
       returns:
         df:
           type: DataFrame
-          description: "The transformed DataFrame."
\ No newline at end of file
+          description: "The transformed DataFrame."
+
+
+TreeBasedSelection:
+  type: class
+  description: "Select features based on a tree-based model and remove features with low importance."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          label_col:
+            type: str
+            description: "Label column name."
+          task_type:
+            type: str
+            description: "Task type, 'cls' for classification, 'mcls' for multi-class classification, 'reg' for regression."
+            enum:
+              - cls
+              - mcls
+              - reg
+        required:
+          - label_col
+          - task_type
+    fit:
+      description: "Fit the TreeBasedSelection model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame containing label_col."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame containing label_col."
+
+VarianceBasedSelection:
+  type: class
+  description: "Select features based on variance and remove features with low variance."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          label_col:
+            type: str
+            description: "Label column name."
+          threshold:
+            type: float
+            description: "Threshold for variance."
+            default: 0.0
+        required:
+          - label_col
+    fit:
+      description: "Fit the VarianceBasedSelection model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame containing label_col."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame containing label_col."
\ No newline at end of file