Merge branch 'dev' into dev_make_tools

2026-06-23 15:48:11 +02:00 · 2023-12-18 22:22:38 +08:00 · 2023-12-18 22:22:38 +08:00 · ea7e11665d
commit ea7e11665d
parent 79787e8129 0701178625
24 changed files with 1801 additions and 1226 deletions
--- a/metagpt/tools/functions/__init__.py
+++ b/metagpt/tools/functions/__init__.py
@@ -4,5 +4,3 @@
 # @Author  : lidanyang
 # @File    : __init__.py
 # @Desc    :
-from metagpt.tools.functions.register.register import registry
-import metagpt.tools.functions.libs.feature_engineering
--- a/metagpt/tools/functions/libs/base.py
+++ b/metagpt/tools/functions/libs/base.py
@@ -0,0 +1,16 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Time    : 2023/12/10 20:12
+# @Author  : lidanyang
+# @File    : base
+# @Desc    :
+class MLProcess:
+    """Minimal fit/transform interface shared by the data-processing tools.
+
+    Subclasses override ``fit`` and ``transform``; ``fit_transform`` chains
+    the two on the same frame.
+    """
+
+    def fit(self, df):
+        """Learn state from ``df``. Must be overridden by subclasses."""
+        raise NotImplementedError
+
+    def transform(self, df):
+        """Apply previously learned state to ``df``. Must be overridden."""
+        raise NotImplementedError
+
+    def fit_transform(self, df):
+        """Call ``fit(df)`` and return whatever ``transform(df)`` produces."""
+        self.fit(df)
+        transformed = self.transform(df)
+        return transformed
--- a/metagpt/tools/functions/libs/data_preprocess.py
+++ b/metagpt/tools/functions/libs/data_preprocess.py
@@ -1,123 +1,153 @@
-
-import pandas as pd
 import numpy as np
-
+import pandas as pd
 from sklearn.impute import SimpleImputer
 from sklearn.preprocessing import LabelEncoder
-from sklearn.preprocessing import KBinsDiscretizer
-from sklearn.preprocessing import MinMaxScaler
-from sklearn.preprocessing import StandardScaler
 from sklearn.preprocessing import MaxAbsScaler
-from sklearn.preprocessing import RobustScaler
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.preprocessing import OneHotEncoder
 from sklearn.preprocessing import OrdinalEncoder
+from sklearn.preprocessing import RobustScaler
+from sklearn.preprocessing import StandardScaler

-from metagpt.tools.functions import registry
-from metagpt.tools.functions.schemas.data_preprocess import *
+from metagpt.tools.functions.libs.base import MLProcess


-@registry.register("data_preprocess", FillMissingValue)
-def fill_missing_value(df: pd.DataFrame, features: list, strategy: str = 'mean', fill_value=None,):
-    df[features] = SimpleImputer(strategy=strategy, fill_value=fill_value).fit_transform(df[features])
-    return df
+class FillMissingValue(MLProcess):
+    """Impute missing values in the selected columns with a ``SimpleImputer``."""
+
+    def __init__(self, features: list, strategy: str = 'mean', fill_value=None,):
+        # Settings are only stored here; the imputer itself is built in fit().
+        self.si = None
+        self.features = features
+        self.strategy = strategy
+        self.fill_value = fill_value
+
+    def fit(self, df: pd.DataFrame):
+        """Create a new imputer and fit it on ``df[self.features]``."""
+        self.si = imputer = SimpleImputer(strategy=self.strategy, fill_value=self.fill_value)
+        imputer.fit(df[self.features])
+
+    def transform(self, df: pd.DataFrame):
+        """Overwrite ``df[self.features]`` with the imputed values and return ``df``."""
+        imputed = self.si.transform(df[self.features])
+        df[self.features] = imputed
+        return df


-# @registry.register("data_preprocess", FillMissingValue)
-# def label_encode(df: pd.DataFrame, features: list,):
-#     for col in features:
-#         df[col] = LabelEncoder().fit_transform(df[col])
-#     return df
+class MinMaxScale(MLProcess):
+    """Rescale the selected columns with scikit-learn's ``MinMaxScaler``."""
+
+    def __init__(self, features: list,):
+        self.mms = None  # built by fit()
+        self.features = features
+
+    def fit(self, df: pd.DataFrame):
+        """Create a new scaler and fit it on ``df[self.features]``."""
+        self.mms = scaler = MinMaxScaler()
+        scaler.fit(df[self.features])
+
+    def transform(self, df: pd.DataFrame):
+        """Replace ``df[self.features]`` with the scaled values and return ``df``."""
+        scaled = self.mms.transform(df[self.features])
+        df[self.features] = scaled
+        return df


-@registry.register("data_preprocess", SplitBins)
-def split_bins(df: pd.DataFrame, features: list, strategy: str = 'quantile',):
-    df[features] = KBinsDiscretizer(strategy=strategy, encode='ordinal').fit_transform(df[features])
-    return df
+class StandardScale(MLProcess):
+    """Rescale the selected columns with scikit-learn's ``StandardScaler``."""
+
+    def __init__(self, features: list,):
+        self.ss = None  # built by fit()
+        self.features = features
+
+    def fit(self, df: pd.DataFrame):
+        """Create a new scaler and fit it on ``df[self.features]``."""
+        self.ss = scaler = StandardScaler()
+        scaler.fit(df[self.features])
+
+    def transform(self, df: pd.DataFrame):
+        """Replace ``df[self.features]`` with the scaled values and return ``df``."""
+        scaled = self.ss.transform(df[self.features])
+        df[self.features] = scaled
+        return df


-@registry.register("data_preprocess", MinMaxScale)
-def min_max_scale(df: pd.DataFrame, features: list, ):
-    df[features] = MinMaxScaler().fit_transform(df[features])
-    return df
+class MaxAbsScale(MLProcess):
+    """Rescale the selected columns with scikit-learn's ``MaxAbsScaler``."""
+
+    def __init__(self, features: list,):
+        self.mas = None  # built by fit()
+        self.features = features
+
+    def fit(self, df: pd.DataFrame):
+        """Create a new scaler and fit it on ``df[self.features]``."""
+        self.mas = scaler = MaxAbsScaler()
+        scaler.fit(df[self.features])
+
+    def transform(self, df: pd.DataFrame):
+        """Replace ``df[self.features]`` with the scaled values and return ``df``."""
+        scaled = self.mas.transform(df[self.features])
+        df[self.features] = scaled
+        return df


-@registry.register("data_preprocess", StandardScale)
-def standard_scale(df: pd.DataFrame, features: list, ):
-    df[features] = StandardScaler().fit_transform(df[features])
-    return df
+class RobustScale(MLProcess):
+    """Rescale the selected columns with scikit-learn's ``RobustScaler``."""
+
+    def __init__(self, features: list,):
+        self.rs = None  # built by fit()
+        self.features = features
+
+    def fit(self, df: pd.DataFrame):
+        """Create a new scaler and fit it on ``df[self.features]``."""
+        self.rs = scaler = RobustScaler()
+        scaler.fit(df[self.features])
+
+    def transform(self, df: pd.DataFrame):
+        """Replace ``df[self.features]`` with the scaled values and return ``df``."""
+        scaled = self.rs.transform(df[self.features])
+        df[self.features] = scaled
+        return df


-@registry.register("data_preprocess", LogTransform)
-def log_transform(df: pd.DataFrame, features: list, ):
-    for col in features:
-        if df[col].min() <= 0:
-            df[col] = df[col] - df[col].min() + 2
-        df[col] = np.log(df[col])
-    return df
+class OrdinalEncode(MLProcess):
+    """Encode the selected columns with scikit-learn's ``OrdinalEncoder``."""
+
+    def __init__(self, features: list,):
+        self.oe = None  # built by fit()
+        self.features = features
+
+    def fit(self, df: pd.DataFrame):
+        """Create a new encoder and fit it on ``df[self.features]``."""
+        self.oe = encoder = OrdinalEncoder()
+        encoder.fit(df[self.features])
+
+    def transform(self, df: pd.DataFrame):
+        """Replace ``df[self.features]`` with the encoded values and return ``df``."""
+        encoded = self.oe.transform(df[self.features])
+        df[self.features] = encoded
+        return df


-@registry.register("data_preprocess", MaxAbsScale)
-def max_abs_scale(df: pd.DataFrame, features: list, ):
-    df[features] = MaxAbsScaler().fit_transform(df[features])
-    return df
+class OneHotEncode(MLProcess):
+    """Replace the selected columns with dense one-hot indicator columns.
+
+    The encoder is created with ``handle_unknown="ignore"``, so categories
+    that were not present during ``fit`` do not raise at transform time.
+    """
+
+    def __init__(self, features: list,):
+        self.features = features
+        self.ohe = None
+
+    def fit(self, df: pd.DataFrame):
+        """Create a dense-output ``OneHotEncoder`` and fit it on ``df[self.features]``."""
+        try:
+            # scikit-learn >= 1.2 calls the dense-output switch ``sparse_output``;
+            # the older ``sparse`` keyword was deprecated there and later removed.
+            self.ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
+        except TypeError:
+            # Older scikit-learn releases only know the ``sparse`` keyword.
+            self.ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)
+        self.ohe.fit(df[self.features])
+
+    def transform(self, df: pd.DataFrame):
+        """Return ``df`` with ``self.features`` swapped for their one-hot columns.
+
+        The source columns are dropped from the passed frame in place; the
+        returned frame is a new object that also carries the encoded columns.
+        """
+        ts_data = self.ohe.transform(df[self.features])
+        new_columns = self.ohe.get_feature_names_out(self.features)
+        # Reuse df's index so the concat below lines rows up one-to-one.
+        ts_data = pd.DataFrame(ts_data, columns=new_columns, index=df.index)
+        df.drop(self.features, axis=1, inplace=True)
+        df = pd.concat([df, ts_data], axis=1)
+        return df


-@registry.register("data_preprocess", RobustScale)
-def robust_scale(df: pd.DataFrame, features: list, ):
-    df[features] = RobustScaler().fit_transform(df[features])
-    return df
+class LabelEncode(MLProcess):
+    """Integer-encode the selected columns, one ``LabelEncoder`` per column.
+
+    Values are compared as strings. Each encoder also learns the extra class
+    ``'unknown'``, which stands in for values first met at transform time.
+    """
+
+    def __init__(self, features: list,):
+        self.features = features
+        self.le_encoders = []
+
+    def fit(self, df: pd.DataFrame):
+        """Fit one encoder per feature on the string form of its distinct values."""
+        # Start from an empty list: transform() pairs encoders with features by
+        # position, so encoders left over from an earlier fit() would be used
+        # instead of the new ones.
+        self.le_encoders = []
+        for col in self.features:
+            le = LabelEncoder().fit(df[col].astype(str).unique().tolist() + ['unknown'])
+            self.le_encoders.append(le)
+
+    def transform(self, df: pd.DataFrame):
+        """Overwrite each feature column with its integer codes and return ``df``."""
+        for i, col in enumerate(self.features):
+            le = self.le_encoders[i]
+            # One set lookup per value instead of rebuilding the whole list for
+            # every unseen category.
+            known = set(le.classes_.tolist())
+            data_list = [
+                x if x in known else 'unknown'
+                for x in df[col].astype(str).tolist()
+            ]
+            df[col] = le.transform(data_list)
+        return df


-@registry.register("data_preprocess", OrdinalEncode)
-def ordinal_encode(df: pd.DataFrame, features: list,):
-    df[features] = OrdinalEncoder().fit_transform(df[features])
-    return df
+def get_column_info(df: pd.DataFrame) -> dict:
+    """Summarise every column of ``df``: name, dtype, NaN percentage, distinct count.
+
+    The rows collected below are turned into a frame with the columns
+    ``Column_name``, ``Data_type``, ``NaN_Frequency(%)`` and ``N_unique`` and
+    returned as a dict of lists.
+    """
+    data = []
+    for i in df.columns:
+        # Share of missing values as a percentage, kept to two significant digits.
+        nan_freq = float("%.2g" % (df[i].isna().mean() * 100))
+        n_unique = df[i].nunique()
+        data_type = str(df[i].dtype).replace("dtype('", "").replace("')", "")
+        # Spell the short "O" dtype code out as "object".
+        if data_type == "O":
+            data_type = "object"
+        data.append([i, data_type, nan_freq, n_unique])

-
-if __name__ == '__main__':
-    def run():
-        V = {
-            'a': [-1, 2, 3, 6, 5, 4],
-            'b': [1.1, 2.2, 3.3, 6.6, 5.5, 4.4],
-            'c': ['aa', 'bb', 'cc', 'dd', 'ee', 'ff'],
-            'd': [1, None, 3, None, 5, 4],
-            'e': [1.1, np.NAN, 3.3, None, 5.5, 4.4],
-            'f': ['aa', np.NAN, 'cc', None, '', 'ff'],
-
-        }
-
-        df = pd.DataFrame(V)
-        print(df.dtypes)
-
-        numeric_features = ['a', 'b', 'd', 'e']
-        numeric_features_wo_miss = ['a', 'b', ]
-        categorial_features = ['c', 'f']
-
-        df_ = fill_missing_value(df.copy(), numeric_features)
-        print(df_)
-        df_ = fill_missing_value(df.copy(), categorial_features, strategy='constant', fill_value='hehe')
-        print(df_)
-
-        df_ = fill_missing_value(df.copy(), numeric_features, strategy='constant', fill_value=999)
-        print(df_)
-
-        # df_ = label_encode(df.copy(), numeric_features + categorial_features, )
-        # print(df_)
-
-        df_ = split_bins(df.copy(), numeric_features_wo_miss, strategy='quantile')
-        print(df_)
-
-        df_ = min_max_scale(df.copy(), numeric_features, )
-        print(df_)
-
-        df_ = standard_scale(df.copy(), numeric_features, )
-        print(df_)
-
-        df_ = log_transform(df.copy(), numeric_features, )
-        print(df_)
-
-        df_ = max_abs_scale(df.copy(), numeric_features, )
-        print(df_)
-
-        df_ = robust_scale(df.copy(), numeric_features, )
-        print(df_)
-    run()
+    samples = pd.DataFrame(
+        data,
+        columns=["Column_name", "Data_type", "NaN_Frequency(%)", "N_unique"],
+    )
+    return samples.to_dict(orient='list')
--- a/metagpt/tools/functions/libs/feature_engineering.py
+++ b/metagpt/tools/functions/libs/feature_engineering.py
@@ -3,172 +3,290 @@
 # @Time    : 2023/11/17 10:33
 # @Author  : lidanyang
 # @File    : feature_engineering.py
-# @Desc    : Feature Engineering Functions
+# @Desc    : Feature Engineering Tools
 import itertools

+import numpy as np
+import pandas as pd
 from dateutil.relativedelta import relativedelta
+from joblib import Parallel, delayed
 from pandas.api.types import is_numeric_dtype
-from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder
+from pandas.core.dtypes.common import is_object_dtype
+from sklearn.model_selection import KFold
+from sklearn.preprocessing import PolynomialFeatures, KBinsDiscretizer

-from metagpt.tools.functions import registry
-from metagpt.tools.functions.schemas.feature_engineering import *
+from metagpt.tools.functions.libs.base import MLProcess


-@registry.register("feature_engineering", PolynomialExpansion)
-def polynomial_expansion(df, cols, degree=2):
-    for col in cols:
-        if not is_numeric_dtype(df[col]):
-            raise ValueError(f"Column '{col}' must be numeric.")
+class PolynomialExpansion(MLProcess):
+    """Generate polynomial feature columns from ``cols`` with ``PolynomialFeatures``.
+
+    The transformer is created up front with ``include_bias=False``. Missing
+    values in ``cols`` are replaced by 0 before fitting and transforming.
+    """
+
+    def __init__(self, cols: list, degree: int = 2):
+        self.cols = cols
+        self.degree = degree
+        self.poly = PolynomialFeatures(degree=degree, include_bias=False)

-    poly = PolynomialFeatures(degree=degree, include_bias=False)
-    ts_data = poly.fit_transform(df[cols].fillna(0))
-    new_columns = poly.get_feature_names_out(cols)
-    ts_data = pd.DataFrame(ts_data, columns=new_columns, index=df.index)
-    ts_data = ts_data.drop(cols, axis=1)
-    df = pd.concat([df, ts_data], axis=1)
-    return df
+    def fit(self, df: pd.DataFrame):
+        """Fit the transformer on ``df[self.cols]`` with NaNs filled by 0."""
+        self.poly.fit(df[self.cols].fillna(0))
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Return ``df`` with ``self.cols`` replaced by the generated feature columns.
+
+        The input columns are dropped from the passed frame in place; the
+        returned frame is a new object with the generated columns appended.
+        """
+        ts_data = self.poly.transform(df[self.cols].fillna(0))
+        column_name = self.poly.get_feature_names_out(self.cols)
+        # Reuse df's index so the concat below lines rows up one-to-one.
+        ts_data = pd.DataFrame(ts_data, index=df.index, columns=column_name)
+        df.drop(self.cols, axis=1, inplace=True)
+        df = pd.concat([df, ts_data], axis=1)
+        return df


-@registry.register("feature_engineering", OneHotEncoding)
-def one_hot_encoding(df, cols):
-    enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
-    ts_data = enc.fit_transform(df[cols])
-    new_columns = enc.get_feature_names_out(cols)
-    ts_data = pd.DataFrame(ts_data, columns=new_columns, index=df.index)
-    df.drop(cols, axis=1, inplace=True)
-    df = pd.concat([df, ts_data], axis=1)
-    return df
+class CatCount(MLProcess):
+    """Add a ``<col>_cnt`` column with each category's frequency seen during fit."""
+
+    def __init__(self, col: str):
+        self.encoder_dict = None  # category -> count, filled by fit()
+        self.col = col
+
+    def fit(self, df: pd.DataFrame):
+        """Count how often every value of ``self.col`` occurs in ``df``."""
+        counts = df[self.col].value_counts()
+        self.encoder_dict = counts.to_dict()
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Write the stored counts into ``df`` as ``<col>_cnt`` and return ``df``."""
+        count_col = f"{self.col}_cnt"
+        df[count_col] = df[self.col].map(self.encoder_dict)
+        return df


-@registry.register("feature_engineering", FrequencyEncoding)
-def frequency_encoding(df, cols):
-    for col in cols:
-        encoder_dict = df[col].value_counts().to_dict()
-        df[f"{col}_cnt"] = df[col].map(encoder_dict)
-    return df
+class TargetMeanEncoder(MLProcess):
+    """Add a ``<col>_target_mean`` column: the mean label per category seen during fit."""
+
+    def __init__(self, col: str, label: str):
+        self.encoder_dict = None  # category -> mean label, filled by fit()
+        self.col = col
+        self.label = label
+
+    def fit(self, df: pd.DataFrame):
+        """Compute the mean of ``self.label`` for every value of ``self.col``."""
+        label_by_category = df.groupby(self.col)[self.label]
+        self.encoder_dict = label_by_category.mean().to_dict()
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Write the stored means into ``df`` as ``<col>_target_mean`` and return ``df``."""
+        target_col = f"{self.col}_target_mean"
+        df[target_col] = df[self.col].map(self.encoder_dict)
+        return df


-@registry.register("feature_engineering", CatCross)
-def cat_cross(df, cols, max_cat_num=100):
-    for col in cols:
-        if df[col].nunique() > max_cat_num:
-            cols.remove(col)
+class KFoldTargetMeanEncoder(MLProcess):
+    """Target-mean encoding of ``col`` built from out-of-fold label means.
+
+    ``fit`` splits a copy of the frame with a shuffled ``KFold``; each
+    validation fold gets the per-category label means of its training folds,
+    gaps are filled with the global label mean, and the per-category mean of
+    those values is stored in ``encoder_dict``. ``transform`` maps ``col``
+    through that dict into ``<col>_kf_target_mean``.
+
+    NOTE(review): ``fit`` fills the gaps with a chained
+    ``tmp[col_name].fillna(..., inplace=True)``; under pandas copy-on-write
+    this may not update ``tmp`` - confirm against the pandas version in use.
+    """
+
+    def __init__(self, col: str, label: str, n_splits: int = 5, random_state: int = 2021):
+        self.col = col
+        self.label = label
+        # Passed to KFold in fit().
+        self.n_splits = n_splits
+        self.random_state = random_state
+        # category -> encoded value, filled by fit().
+        self.encoder_dict = None

-    for col1, col2 in itertools.combinations(cols, 2):
-        cross_col = f"{col1}_cross_{col2}"
-        df[cross_col] = df[col1].astype(str) + "_" + df[col2].astype(str)
-    return df
+    def fit(self, df: pd.DataFrame):
+        tmp = df.copy()
+        kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state)

-
-@registry.register("feature_engineering", GroupStat)
-def group_stat(df, group_col, agg_col, agg_funcs):
-    group_df = df.groupby(group_col)[agg_col].agg(agg_funcs).reset_index()
-    group_df.columns = group_col + [
-        f"{agg_col}_{agg_func}_by_{group_col}" for agg_func in agg_funcs
-    ]
-    df = df.merge(group_df, on=group_col, how="left")
-    return df
-
-
-@registry.register("feature_engineering", ExtractTimeComps)
-def extract_time_comps(df, time_col, time_comps):
-    time_s = pd.to_datetime(df[time_col], errors="coerce")
-    time_comps_df = pd.DataFrame()
-
-    if "year" in time_comps:
-        time_comps_df["year"] = time_s.dt.year
-    if "month" in time_comps:
-        time_comps_df["month"] = time_s.dt.month
-    if "day" in time_comps:
-        time_comps_df["day"] = time_s.dt.day
-    if "hour" in time_comps:
-        time_comps_df["hour"] = time_s.dt.hour
-    if "dayofweek" in time_comps:
-        time_comps_df["dayofweek"] = time_s.dt.dayofweek + 1
-    if "is_weekend" in time_comps:
-        time_comps_df["is_weekend"] = time_s.dt.dayofweek.isin([5, 6]).astype(int)
-    df = pd.concat([df, time_comps_df], axis=1)
-    return df
-
-
-@registry.register("feature_engineering", FeShiftByTime)
-def fe_shift_by_time(df, time_col, group_col, shift_col, periods, freq):
-    df[time_col] = pd.to_datetime(df[time_col])
-
-    def shift_datetime(date, offset, unit):
-        if unit in ["year", "y", "Y"]:
-            return date + relativedelta(years=offset)
-        elif unit in ["month", "m", "M"]:
-            return date + relativedelta(months=offset)
-        elif unit in ["day", "d", "D"]:
-            return date + relativedelta(days=offset)
-        elif unit in ["week", "w", "W"]:
-            return date + relativedelta(weeks=offset)
-        elif unit in ["hour", "h", "H"]:
-            return date + relativedelta(hours=offset)
-        else:
-            return date
-
-    def shift_by_time_on_key(
-        inner_df, time_col, group_col, shift_col, offset, unit, col_name
-    ):
-        inner_df = inner_df.drop_duplicates()
-        inner_df[time_col] = inner_df[time_col].map(
-            lambda x: shift_datetime(x, offset, unit)
-        )
-        inner_df = inner_df.groupby([time_col, group_col], as_index=False)[
-            shift_col
-        ].mean()
-        inner_df.rename(columns={shift_col: col_name}, inplace=True)
-        return inner_df
-
-    shift_df = df[[time_col, group_col, shift_col]].copy()
-    for period in periods:
-        new_col_name = f"{group_col}_{shift_col}_lag_{period}_{freq}"
-        tmp = shift_by_time_on_key(
-            shift_df, time_col, group_col, shift_col, period, freq, new_col_name
-        )
-        df = df.merge(tmp, on=[time_col, group_col], how="left")
-
-    return df
-
-
-@registry.register("feature_engineering", FeRollingByTime)
-def fe_rolling_by_time(df, time_col, group_col, rolling_col, periods, freq, agg_funcs):
-    df[time_col] = pd.to_datetime(df[time_col])
-
-    def rolling_by_time_on_key(inner_df, offset, unit, agg_func, col_name):
-        time_freq = {
-            "Y": [365 * offset, "D"],
-            "M": [30 * offset, "D"],
-            "D": [offset, "D"],
-            "W": [7 * offset, "D"],
-            "H": [offset, "h"],
-        }
-
-        if agg_func not in ["mean", "std", "max", "min", "median", "sum", "count"]:
-            raise ValueError(f"Invalid agg function: {agg_func}")
-
-        rolling_feat = inner_df.rolling(
-            f"{time_freq[unit][0]}{time_freq[unit][1]}", closed="left"
-        )
-        rolling_feat = getattr(rolling_feat, agg_func)()
-        depth = df.columns.nlevels
-        rolling_feat = rolling_feat.stack(list(range(depth)))
-        rolling_feat.name = col_name
-        return rolling_feat
-
-    rolling_df = df[[time_col, group_col, rolling_col]].copy()
-    for period in periods:
-        for func in agg_funcs:
-            new_col_name = f"{group_col}_{rolling_col}_rolling_{period}_{freq}_{func}"
-            tmp = pd.pivot_table(
-                rolling_df,
-                index=time_col,
-                values=rolling_col,
-                columns=group_col,
+        global_mean = tmp[self.label].mean()
+        col_name = f"{self.col}_kf_target_mean"
+        for trn_idx, val_idx in kf.split(tmp, tmp[self.label]):
+            _trn, _val = tmp.iloc[trn_idx], tmp.iloc[val_idx]
+            tmp.loc[tmp.index[val_idx], col_name] = _val[self.col].map(
+                _trn.groupby(self.col)[self.label].mean()
            )
-            tmp = rolling_by_time_on_key(tmp, period, freq, func, new_col_name)
-            df = df.merge(tmp, on=[time_col, group_col], how="left")
+        tmp[col_name].fillna(global_mean, inplace=True)
+        self.encoder_dict = tmp.groupby(self.col)[col_name].mean().to_dict()

-    return df
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Map ``self.col`` through ``encoder_dict`` into ``<col>_kf_target_mean``; returns ``df``."""
+        df[f"{self.col}_kf_target_mean"] = df[self.col].map(self.encoder_dict)
+        return df
+
+
+class CatCross(MLProcess):
+    """Cross pairs of categorical columns into integer-coded combination columns.
+
+    ``fit`` keeps only columns with at most ``max_cat_num`` distinct values,
+    enumerates every pair of the remaining columns and assigns an integer code
+    to each value combination. ``transform`` adds one ``<a>_<b>`` column per
+    pair; combinations that were not enumerated during ``fit`` share one
+    extra code.
+    """
+
+    def __init__(self, cols: list, max_cat_num: int = 100):
+        self.cols = cols
+        self.max_cat_num = max_cat_num
+        self.combs = []
+        self.combs_map = {}
+
+    @staticmethod
+    def cross_two(comb, df):
+        """Return ``(new_col, comb_map)`` for one column pair.
+
+        ``comb_map`` numbers every element of the cartesian product of the two
+        columns' distinct values, starting at 0.
+        """
+        new_col = f'{comb[0]}_{comb[1]}'
+        new_col_combs = itertools.product(df[comb[0]].unique(), df[comb[1]].unique())
+        comb_map = {pair: code for code, pair in enumerate(new_col_combs)}
+        return new_col, comb_map
+
+    def fit(self, df: pd.DataFrame):
+        """Select the usable columns and build the code map of every column pair."""
+        # Build a new list rather than calling remove() while iterating: removing
+        # during iteration skips the element after each removed one, and it
+        # would also mutate the list object the caller passed in.
+        self.cols = [col for col in self.cols if df[col].nunique() <= self.max_cat_num]
+        self.combs = list(itertools.combinations(self.cols, 2))
+        res = Parallel(n_jobs=4, require='sharedmem')(
+            delayed(self.cross_two)(comb, df) for comb in self.combs)
+        self.combs_map = dict(res)
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Add one integer-coded column per fitted pair to ``df`` and return it."""
+        for comb in self.combs:
+            new_col = f'{comb[0]}_{comb[1]}'
+            _map = self.combs_map[new_col]
+            # Carry df's own index: a default RangeIndex would be re-aligned on
+            # assignment and produce NaNs for frames with any other index.
+            pairs = pd.Series(list(zip(df[comb[0]], df[comb[1]])), index=df.index)
+            # Codes run 0..len(_map)-1, so len(_map) is the first free number
+            # for unknown combinations (and is well defined for an empty map).
+            codes = pairs.map(_map).fillna(len(_map))
+            df[new_col] = codes.astype(int)
+        return df
+
+
+class GroupStat(MLProcess):
+    """Attach per-group aggregates of one column to every row of a frame."""
+
+    def __init__(self, group_col: str, agg_col: str, agg_funcs: list):
+        self.group_df = None  # aggregate table, filled by fit()
+        self.group_col = group_col
+        self.agg_col = agg_col
+        self.agg_funcs = agg_funcs
+
+    def fit(self, df: pd.DataFrame):
+        """Aggregate ``agg_col`` per ``group_col`` value and keep the resulting table."""
+        grouped = df.groupby(self.group_col)[self.agg_col]
+        stats = grouped.agg(self.agg_funcs).reset_index()
+        stat_names = [
+            f"{self.agg_col}_{agg_func}_by_{self.group_col}" for agg_func in self.agg_funcs
+        ]
+        stats.columns = [self.group_col, *stat_names]
+        self.group_df = stats
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Left-join the stored aggregates onto ``df`` by ``group_col``; returns a new frame."""
+        return df.merge(self.group_df, on=self.group_col, how="left")
+
+
+class SplitBins(MLProcess):
+    """Replace numeric columns by ordinal bin indices from ``KBinsDiscretizer``.
+
+    Missing values are replaced by 0 before fitting and transforming.
+
+    Args:
+        cols: Column names to discretise; a single name is accepted as well.
+        strategy: Binning strategy forwarded to ``KBinsDiscretizer``.
+    """
+
+    def __init__(self, cols: "list | str", strategy: str = 'quantile'):
+        # The discretizer needs 2-D input. A bare column name would select a
+        # 1-D Series, so a single name is wrapped into a one-element list.
+        self.cols = [cols] if isinstance(cols, str) else cols
+        self.strategy = strategy
+        self.encoder = None
+
+    def fit(self, df: pd.DataFrame):
+        """Create a new discretizer and fit it on ``df[self.cols]``."""
+        self.encoder = KBinsDiscretizer(strategy=self.strategy, encode='ordinal')
+        self.encoder.fit(df[self.cols].fillna(0))
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Overwrite ``df[self.cols]`` with the bin indices and return ``df``."""
+        df[self.cols] = self.encoder.transform(df[self.cols].fillna(0))
+        return df
+
+# @registry.register("feature_engineering", ExtractTimeComps)
+# def extract_time_comps(df, time_col, time_comps):
+#     time_s = pd.to_datetime(df[time_col], errors="coerce")
+#     time_comps_df = pd.DataFrame()
+#
+#     if "year" in time_comps:
+#         time_comps_df["year"] = time_s.dt.year
+#     if "month" in time_comps:
+#         time_comps_df["month"] = time_s.dt.month
+#     if "day" in time_comps:
+#         time_comps_df["day"] = time_s.dt.day
+#     if "hour" in time_comps:
+#         time_comps_df["hour"] = time_s.dt.hour
+#     if "dayofweek" in time_comps:
+#         time_comps_df["dayofweek"] = time_s.dt.dayofweek + 1
+#     if "is_weekend" in time_comps:
+#         time_comps_df["is_weekend"] = time_s.dt.dayofweek.isin([5, 6]).astype(int)
+#     df = pd.concat([df, time_comps_df], axis=1)
+#     return df
+#
+#
+# @registry.register("feature_engineering", FeShiftByTime)
+# def fe_shift_by_time(df, time_col, group_col, shift_col, periods, freq):
+#     df[time_col] = pd.to_datetime(df[time_col])
+#
+#     def shift_datetime(date, offset, unit):
+#         if unit in ["year", "y", "Y"]:
+#             return date + relativedelta(years=offset)
+#         elif unit in ["month", "m", "M"]:
+#             return date + relativedelta(months=offset)
+#         elif unit in ["day", "d", "D"]:
+#             return date + relativedelta(days=offset)
+#         elif unit in ["week", "w", "W"]:
+#             return date + relativedelta(weeks=offset)
+#         elif unit in ["hour", "h", "H"]:
+#             return date + relativedelta(hours=offset)
+#         else:
+#             return date
+#
+#     def shift_by_time_on_key(
+#         inner_df, time_col, group_col, shift_col, offset, unit, col_name
+#     ):
+#         inner_df = inner_df.drop_duplicates()
+#         inner_df[time_col] = inner_df[time_col].map(
+#             lambda x: shift_datetime(x, offset, unit)
+#         )
+#         inner_df = inner_df.groupby([time_col, group_col], as_index=False)[
+#             shift_col
+#         ].mean()
+#         inner_df.rename(columns={shift_col: col_name}, inplace=True)
+#         return inner_df
+#
+#     shift_df = df[[time_col, group_col, shift_col]].copy()
+#     for period in periods:
+#         new_col_name = f"{group_col}_{shift_col}_lag_{period}_{freq}"
+#         tmp = shift_by_time_on_key(
+#             shift_df, time_col, group_col, shift_col, period, freq, new_col_name
+#         )
+#         df = df.merge(tmp, on=[time_col, group_col], how="left")
+#
+#     return df
+#
+#
+# @registry.register("feature_engineering", FeRollingByTime)
+# def fe_rolling_by_time(df, time_col, group_col, rolling_col, periods, freq, agg_funcs):
+#     df[time_col] = pd.to_datetime(df[time_col])
+#
+#     def rolling_by_time_on_key(inner_df, offset, unit, agg_func, col_name):
+#         time_freq = {
+#             "Y": [365 * offset, "D"],
+#             "M": [30 * offset, "D"],
+#             "D": [offset, "D"],
+#             "W": [7 * offset, "D"],
+#             "H": [offset, "h"],
+#         }
+#
+#         if agg_func not in ["mean", "std", "max", "min", "median", "sum", "count"]:
+#             raise ValueError(f"Invalid agg function: {agg_func}")
+#
+#         rolling_feat = inner_df.rolling(
+#             f"{time_freq[unit][0]}{time_freq[unit][1]}", closed="left"
+#         )
+#         rolling_feat = getattr(rolling_feat, agg_func)()
+#         depth = df.columns.nlevels
+#         rolling_feat = rolling_feat.stack(list(range(depth)))
+#         rolling_feat.name = col_name
+#         return rolling_feat
+#
+#     rolling_df = df[[time_col, group_col, rolling_col]].copy()
+#     for period in periods:
+#         for func in agg_funcs:
+#             new_col_name = f"{group_col}_{rolling_col}_rolling_{period}_{freq}_{func}"
+#             tmp = pd.pivot_table(
+#                 rolling_df,
+#                 index=time_col,
+#                 values=rolling_col,
+#                 columns=group_col,
+#             )
+#             tmp = rolling_by_time_on_key(tmp, period, freq, func, new_col_name)
+#             df = df.merge(tmp, on=[time_col, group_col], how="left")
+#
+#     return df
+
+
+class GeneralSelection(MLProcess):
+    """Drop feature columns that match simple "no usable signal" rules.
+
+    A feature is discarded when it is entirely null, has exactly one distinct
+    value, contains ``+inf`` or ``-inf``, or is an object column whose values
+    are all distinct. The label column is never tested and is always kept.
+    """
+
+    def __init__(self, label_col: str):
+        self.label_col = label_col
+        self.feats = []
+
+    @staticmethod
+    def _should_drop(values: pd.Series, n_rows: int) -> bool:
+        """Return True when one column matches any of the drop rules."""
+        # Entirely null (an empty frame never matches this rule).
+        if n_rows and values.isnull().sum() == n_rows:
+            return True
+        # Constant column.
+        if values.nunique() == 1:
+            return True
+        # Either sign of infinity; the earlier code tested +inf twice.
+        if ((values == np.inf) | (values == -np.inf)).any():
+            return True
+        # Object column with one distinct value per row.
+        return bool(is_object_dtype(values) and values.nunique() == n_rows)
+
+    def fit(self, df: pd.DataFrame):
+        """Decide which feature columns of ``df`` to keep.
+
+        Each column is judged once, so a column matching several rules is
+        dropped a single time, and the label column is skipped instead of
+        being removed from a list it was never part of.
+        """
+        n_rows = df.shape[0]
+        self.feats = [
+            col
+            for col in df.columns
+            if col != self.label_col and not self._should_drop(df[col], n_rows)
+        ]
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Return ``df`` restricted to the kept features followed by the label column."""
+        df = df[self.feats + [self.label_col]]
+        return df
--- a/metagpt/tools/functions/libs/ml_model.py
+++ b/metagpt/tools/functions/libs/ml_model.py
@@ -1,196 +0,0 @@
-from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import LabelEncoder
-
-from sklearn.linear_model import LogisticRegression
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.ensemble import GradientBoostingClassifier
-
-
-from sklearn.linear_model import LinearRegression
-from sklearn.ensemble import RandomForestRegressor
-from sklearn.ensemble import GradientBoostingRegressor
-
-from metagpt.tools.functions import registry
-from metagpt.tools.functions.schemas.ml_model import *
-
-
-#########
-## 分类 ##
-#########
-
-
-@registry.register("classification_model", LogisticRegressionClassification)
-def logistic_regression_classification(df, label, test_size=0.2, penalty='l2', dual=False):
-    nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
-    for col in nonnumeric_columns:
-        df[col] = LabelEncoder().fit_transform(df[col])
-    df = df.fillna(0)
-
-    features = [col for col in df if col != label]
-    x, y = df[features], df[label]
-    tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1)
-
-    model = LogisticRegression(penalty=penalty, dual=dual)
-    model.fit(tr_x, tr_y, )
-    te_pred_prob = model.predict_proba(te_x)
-
-    res = {
-        'te_pred_prob': te_pred_prob
-    }
-    return res
-
-
-@registry.register("classification_model", RandomForestClassification)
-def random_forest_classification(df, label, test_size=0.2, n_estimators=100, criterion='gini'):
-    nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
-    for col in nonnumeric_columns:
-        df[col] = LabelEncoder().fit_transform(df[col])
-    df = df.fillna(0)
-
-    features = [col for col in df if col != label]
-    x, y = df[features], df[label]
-    tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1)
-    model = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion)
-    model.fit(tr_x, tr_y, )
-    te_pred_prob = model.predict_proba(te_x)
-
-    res = {
-        'te_pred_prob': te_pred_prob
-    }
-    return res
-
-
-@registry.register("classification_model", GradientBoostingClassification)
-def gradient_boosting_classification(df, label, test_size=0.2, n_estimators=100, learning_rate=0.1):
-    nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
-    for col in nonnumeric_columns:
-        df[col] = LabelEncoder().fit_transform(df[col])
-    df = df.fillna(0)
-
-    features = [col for col in df if col != label]
-    x, y = df[features], df[label]
-    tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1)
-    model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate)
-    model.fit(tr_x, tr_y, )
-    te_pred_prob = model.predict_proba(te_x)
-
-    res = {
-        'te_pred_prob': te_pred_prob
-    }
-    return res
-
-
-
-#########
-## 回归 ##
-#########
-
-
-@registry.register("regression_model", LinearRegressionRegression)
-def linear_regression(df, label, test_size=0.2, ):
-    nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
-    for col in nonnumeric_columns:
-        df[col] = LabelEncoder().fit_transform(df[col])
-    df = df.fillna(0)
-
-    features = [col for col in df if col != label]
-    x, y = df[features], df[label]
-    tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1)
-
-    model = LinearRegression()
-    model.fit(tr_x, tr_y, )
-    te_pred_prob = model.predict(te_x)
-
-    res = {
-        'te_pred_prob': te_pred_prob
-    }
-    return res
-
-
-@registry.register("regression_model", RandomForestRegression)
-def random_forest_regression(df, label, test_size=0.2, n_estimators=100, criterion='squared_error'):
-    nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
-    for col in nonnumeric_columns:
-        df[col] = LabelEncoder().fit_transform(df[col])
-    df = df.fillna(0)
-
-    features = [col for col in df if col != label]
-    x, y = df[features], df[label]
-    tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1)
-    model = RandomForestRegressor(n_estimators=n_estimators, criterion=criterion)
-    model.fit(tr_x, tr_y, )
-    te_pred_prob = model.predict(te_x)
-
-    res = {
-        'te_pred_prob': te_pred_prob
-    }
-    return res
-
-
-@registry.register("regression_model", GradientBoostingRegression)
-def gradient_boosting_regression(df, label, test_size=0.2, n_estimators=100, learning_rate=0.1):
-    nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
-    for col in nonnumeric_columns:
-        df[col] = LabelEncoder().fit_transform(df[col])
-    df = df.fillna(0)
-
-    features = [col for col in df if col != label]
-    x, y = df[features], df[label]
-    tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1)
-    model = GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=learning_rate)
-    model.fit(tr_x, tr_y, )
-    te_pred_prob = model.predict(te_x)
-
-    res = {
-        'te_pred_prob': te_pred_prob
-    }
-    return res
-
-
-if __name__ == '__main__':
-    def run():
-        from sklearn.datasets import load_iris
-        loader = load_iris(as_frame=True)
-        df = loader['data']
-        df['target'] = loader['target']
-
-        df[df.columns[0]] = df[df.columns[0]].astype(str)
-        df[df.columns[1]] = df[df.columns[1]].astype(int)
-        df['target'] = df['target'].astype(str)
-
-        print(df)
-        print('####'*5)
-        res = logistic_regression_classification(df, 'target', test_size=0.25, penalty='l2', dual=False)
-        print(res['te_pred_prob'])
-
-        print('####'*5)
-        res = random_forest_classification(df, 'target', test_size=0.25, n_estimators=100, criterion='gini')
-        print(res['te_pred_prob'])
-
-        print('####'*5)
-        res = gradient_boosting_classification(df, 'target', test_size=0.25, n_estimators=100, learning_rate=0.1)
-        print(res['te_pred_prob'])
-
-        from sklearn.datasets import make_regression
-        import pandas as pd
-        loader = make_regression()
-        df = pd.DataFrame(loader[0])
-        df['target'] = loader[1]
-
-        df[df.columns[0]] = df[df.columns[0]].astype(str)
-        df[df.columns[1]] = df[df.columns[1]].astype(int)
-        # df['target'] = df['target'].astype(str)
-
-        print(df)
-        print('####' * 5)
-        res = linear_regression(df, 'target', test_size=0.25, )
-        print(res['te_pred_prob'])
-
-        print('####' * 5)
-        res = random_forest_regression(df, 'target', test_size=0.25, n_estimators=100, criterion='squared_error')
-        print(res['te_pred_prob'])
-
-        print('####' * 5)
-        res = gradient_boosting_regression(df, 'target', test_size=0.25, n_estimators=100, learning_rate=0.1)
-        print(res['te_pred_prob'])
-    run()
--- a/metagpt/tools/functions/register/__init__.py
+++ b/metagpt/tools/functions/register/__init__.py
@@ -1,6 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-# @Time    : 2023/11/16 16:37
-# @Author  : lidanyang
-# @File    : __init__.py
-# @Desc    :
--- a/metagpt/tools/functions/register/register.py
+++ b/metagpt/tools/functions/register/register.py
@ -1,78 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-# @Time    : 2023/11/16 16:38
-# @Author  : lidanyang
-# @File    : register.py
-# @Desc    :
-import inspect
-from typing import Type, Optional, Callable, Dict, Union, List
-
-from metagpt.tools.functions.schemas.base import ToolSchema
-
-
-class FunctionRegistry:
-    def __init__(self):
-        self.functions: Dict[str, Dict[str, Dict]] = {}
-
-    @staticmethod
-    def _check_param_consistency(func_params, schema):
-        param_names = set(func_params.keys())
-        schema_names = set(schema["parameters"]["properties"].keys())
-
-        if param_names != schema_names:
-            raise ValueError("Function parameters do not match schema properties")
-
-    def register(self, module: str, tool_schema: Type[ToolSchema]) -> Callable:
-        def wrapper(func: Callable) -> Callable:
-            module_registry = self.functions.setdefault(module, {})
-
-            if func.__name__ in module_registry:
-                raise ValueError(f"Function {func.__name__} is already registered in {module}")
-
-            func_params = inspect.signature(func).parameters
-
-            schema = tool_schema.schema()
-            schema["name"] = func.__name__
-
-            self._check_param_consistency(func_params, schema)
-
-            module_registry[func.__name__] = {
-                "func": func,
-                "schema": schema,
-            }
-            return func
-
-        return wrapper
-
-    def get(self, module: str, name: str) -> Optional[Union[Callable, Dict]]:
-        """Get function by module and name"""
-        module_registry = self.functions.get(module, {})
-        return module_registry.get(name)
-
-    def get_by_name(self, name: str) -> Optional[Dict]:
-        """Get function by name"""
-        for module_registry in self.functions.values():
-            if name in module_registry:
-                return module_registry.get(name, {})
-
-    def get_all_by_module(self, module: str) -> Optional[Dict]:
-        """Get all functions by module"""
-        return self.functions.get(module, {})
-
-    def get_schema(self, module: str, name: str) -> Optional[Dict]:
-        """Get schema by module and name"""
-        module_registry = self.functions.get(module, {})
-        return module_registry.get(name, {}).get("schema")
-
-    def get_schemas(self, module: str, names: List[str]) -> List[Dict]:
-        """Get schemas by module and names"""
-        module_registry = self.functions.get(module, {})
-        return [module_registry.get(name, {}).get("schema") for name in names]
-
-    def get_all_schema_by_module(self, module: str) -> List[Dict]:
-        """Get all schemas by module"""
-        module_registry = self.functions.get(module, {})
-        return [v.get("schema") for v in module_registry.values()]
-
-
-registry = FunctionRegistry()
--- a/metagpt/tools/functions/schemas/base.py
+++ b/metagpt/tools/functions/schemas/base.py
@ -1,100 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-# @Time    : 2023/11/16 16:34
-# @Author  : lidanyang
-# @File    : base.py
-# @Desc    : Build base class to generate schema for tool
-from typing import Any, List, Optional, get_type_hints
-
-
-class NoDefault:
-    """
-    A class to represent a missing default value.
-
-    This is used to distinguish between a default value of None and a missing default value.
-    """
-    pass
-
-
-def tool_field(
-    description: str, default: Any = NoDefault(), enum: Optional[List[Any]] = None, **kwargs
-):
-    """
-    Create a field for a tool parameter.
-
-    Args:
-        description (str): A description of the field.
-        default (Any, optional): The default value for the field. Defaults to None.
-        enum (Optional[List[Any]], optional): A list of possible values for the field. Defaults to None.
-        **kwargs: Additional keyword arguments.
-
-    Returns:
-        dict: A dictionary representing the field with provided attributes.
-    """
-    field_info = {
-        "description": description,
-        "default": default,
-        "enum": enum,
-    }
-    field_info.update(kwargs)
-    return field_info
-
-
-class ToolSchema:
-    @staticmethod
-    def format_type(type_hint):
-        """
-        Format a type hint into a string representation.
-
-        Args:
-            type_hint (type): The type hint to format.
-
-        Returns:
-            str: A string representation of the type hint.
-        """
-        if isinstance(type_hint, type):
-            # Handle built-in types separately
-            if type_hint.__module__ == "builtins":
-                return type_hint.__name__
-            else:
-                return f"{type_hint.__module__}.{type_hint.__name__}"
-        elif hasattr(type_hint, "__origin__") and hasattr(type_hint, "__args__"):
-            # Handle generic types (like List[int])
-            origin_type = ToolSchema.format_type(type_hint.__origin__)
-            args_type = ", ".join(
-                [ToolSchema.format_type(t) for t in type_hint.__args__]
-            )
-            return f"{origin_type}[{args_type}]"
-        else:
-            return str(type_hint)
-
-    @classmethod
-    def schema(cls):
-        """
-        Generate a schema dictionary for the class.
-
-        The schema includes the class name, description, and information about
-        each class parameter based on type hints and field definitions.
-
-        Returns:
-            dict: A dictionary representing the schema of the class.
-        """
-        schema = {
-            "name": cls.__name__,
-            "description": cls.__doc__,
-            "parameters": {"type": "object", "properties": {}, "required": []},
-        }
-        type_hints = get_type_hints(cls)
-        for attr, type_hint in type_hints.items():
-            value = getattr(cls, attr, None)
-            if isinstance(value, dict):
-                # Process each attribute that is defined using the field function
-                prop_info = {k: v for k, v in value.items() if v is not None or k == "default"}
-                if isinstance(prop_info["default"], NoDefault):
-                    del prop_info["default"]
-                prop_info["type"] = ToolSchema.format_type(type_hint)
-                schema["parameters"]["properties"][attr] = prop_info
-                # Check for required fields
-                if "default" not in prop_info:
-                    schema["parameters"]["required"].append(attr)
-        return schema
--- a/metagpt/tools/functions/schemas/data_preprocess.py
+++ b/metagpt/tools/functions/schemas/data_preprocess.py
@ -1,62 +0,0 @@
-
-import pandas as pd
-
-from metagpt.tools.functions.schemas.base import tool_field, ToolSchema
-
-
-class FillMissingValue(ToolSchema):
-    """Completing missing values with simple strategies"""
-    df: pd.DataFrame = tool_field(description="input dataframe")
-    features: list = tool_field(description="columns to be processed")
-    strategy: str = tool_field(description="the imputation strategy", default='mean')
-    fill_value: int = tool_field(description="fill_value is used to replace all occurrences of missing_values", default=None)
-
-
-# class LabelEncode(ToolSchema):
-#     """Completing missing values with simple strategies"""
-#     df: pd.DataFrame = tool_field(description="input dataframe")
-#     features: list = tool_field(description="columns to be processed")
-
-
-class SplitBins(ToolSchema):
-    """Bin continuous data into intervals and return the bin identifier encoded as an integer value"""
-    df: pd.DataFrame = tool_field(description="input dataframe")
-    features: list = tool_field(description="columns to be processed")
-    strategy: str = tool_field(description="Strategy used to define the widths of the bins", default='quantile')
-
-
-class MinMaxScale(ToolSchema):
-    """Transform features by scaling each feature to a range, witch is (0, 1)"""
-    df: pd.DataFrame = tool_field(description="input dataframe")
-    features: list = tool_field(description="columns to be processed")
-
-
-class StandardScale(ToolSchema):
-    """Standardize features by removing the mean and scaling to unit variance"""
-    df: pd.DataFrame = tool_field(description="input dataframe")
-    features: list = tool_field(description="columns to be processed")
-
-
-class LogTransform(ToolSchema):
-    """Performs a logarithmic transformation on the specified columns"""
-    df: pd.DataFrame = tool_field(description="input dataframe")
-    features: list = tool_field(description="columns to be processed")
-
-
-class MaxAbsScale(ToolSchema):
-    """Scale each feature by its maximum absolute value"""
-    df: pd.DataFrame = tool_field(description="input dataframe")
-    features: list = tool_field(description="columns to be processed")
-
-
-class RobustScale(ToolSchema):
-    """Scale features using statistics that are robust to outliers, the quantile_range is (25.0, 75.0)"""
-    df: pd.DataFrame = tool_field(description="input dataframe")
-    features: list = tool_field(description="columns to be processed")
-
-
-class OrdinalEncode(ToolSchema):
-    """Encode categorical features as an integer array"""
-    df: pd.DataFrame = tool_field(description="input dataframe")
-    features: list = tool_field(description="columns to be processed")
-
--- a/metagpt/tools/functions/schemas/data_preprocess.yml
+++ b/metagpt/tools/functions/schemas/data_preprocess.yml
@ -0,0 +1,306 @@
+FillMissingValue:
+  type: class
+  description: "Completing missing values with simple strategies"
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          features:
+            type: list
+            description: "columns to be processed"
+          strategy:
+            type: str
+            description: "the imputation strategy"
+            default: mean
+            enum:
+              - mean
+              - median
+              - most_frequent
+              - constant
+          fill_value:
+            type: int
+            description: "fill_value is used to replace all occurrences of missing_values"
+            default: null
+        required:
+          - features
+    fit:
+      description: "Fit the FillMissingValue model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+
+MinMaxScale:
+  type: class
+  description: "Transform features by scaling each feature to a range, which is (0, 1)"
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          features:
+            type: list
+            description: "columns to be processed"
+        required:
+          - features
+    fit:
+      description: "Fit the MinMaxScale model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+
+StandardScale:
+  type: class
+  description: "Standardize features by removing the mean and scaling to unit variance"
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          features:
+            type: list
+            description: "columns to be processed"
+        required:
+          - features
+    fit:
+      description: "Fit the StandardScale model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+
+MaxAbsScale:
+  type: class
+  description: "Scale each feature by its maximum absolute value"
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          features:
+            type: list
+            description: "columns to be processed"
+        required:
+          - features
+    fit:
+      description: "Fit the MaxAbsScale model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+
+LabelEncode:
+  type: class
+  description: "Apply label encoding to specified categorical columns in-place."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          features:
+            type: list
+            description: "Categorical columns to be label encoded"
+        required:
+          - features
+    fit:
+      description: "Fit the LabelEncode model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+
+OneHotEncode:
+  type: class
+  description: "Apply one-hot encoding to specified categorical columns, the original columns will be dropped."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          features:
+            type: list
+            description: "Categorical columns to be one-hot encoded and dropped"
+        required:
+          - features
+    fit:
+      description: "Fit the OneHotEncode model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
--- a/metagpt/tools/functions/schemas/feature_engineering.py
+++ b/metagpt/tools/functions/schemas/feature_engineering.py
@ -1,100 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-# @Time    : 2023/11/17 10:34
-# @Author  : lidanyang
-# @File    : feature_engineering.py
-# @Desc    : Schema for feature engineering functions
-from typing import List
-
-import pandas as pd
-
-from metagpt.tools.functions.schemas.base import ToolSchema, tool_field
-
-
-class PolynomialExpansion(ToolSchema):
-    """Generate polynomial and interaction features from selected columns, excluding the bias column."""
-
-    df: pd.DataFrame = tool_field(description="DataFrame to process.")
-    cols: list = tool_field(description="Columns for polynomial expansion.")
-    degree: int = tool_field(description="Degree of polynomial features.", default=2)
-
-
-class OneHotEncoding(ToolSchema):
-    """Apply one-hot encoding to specified categorical columns in a DataFrame."""
-
-    df: pd.DataFrame = tool_field(description="DataFrame to process.")
-    cols: list = tool_field(description="Categorical columns to be one-hot encoded.")
-
-
-class FrequencyEncoding(ToolSchema):
-    """Convert categorical columns to frequency encoding."""
-
-    df: pd.DataFrame = tool_field(description="DataFrame to process.")
-    cols: list = tool_field(description="Categorical columns to be frequency encoded.")
-
-
-class CatCross(ToolSchema):
-    """Create pairwise crossed features from categorical columns, joining values with '_'."""
-
-    df: pd.DataFrame = tool_field(description="DataFrame to process.")
-    cols: list = tool_field(description="Columns to be pairwise crossed.")
-    max_cat_num: int = tool_field(
-        description="Maximum unique categories per crossed feature.", default=100
-    )
-
-
-class GroupStat(ToolSchema):
-    """Perform aggregation operations on a specified column grouped by certain categories."""
-
-    df: pd.DataFrame = tool_field(description="DataFrame to process.")
-    group_col: str = tool_field(description="Column used for grouping.")
-    agg_col: str = tool_field(description="Column on which aggregation is performed.")
-    agg_funcs: list = tool_field(
-        description="""List of aggregation functions to apply, such as ['mean', 'std'].
-                    Each function must be supported by pandas."""
-    )
-
-
-class ExtractTimeComps(ToolSchema):
-    """Extract specific time components from a designated time column in a DataFrame."""
-
-    df: pd.DataFrame = tool_field(description="DataFrame to process.")
-    time_col: str = tool_field(
-        description="The name of the column containing time data."
-    )
-    time_comps: List[str] = tool_field(
-        description="""List of time components to extract.
-        Each component must be in ['year', 'month', 'day', 'hour', 'dayofweek', 'is_weekend']."""
-    )
-
-
-class FeShiftByTime(ToolSchema):
-    """Shift column values in a DataFrame based on specified time intervals."""
-
-    df: pd.DataFrame = tool_field(description="DataFrame to process.")
-    time_col: str = tool_field(description="Column for time-based shifting.")
-    group_col: str = tool_field(description="Column for grouping before shifting.")
-    shift_col: str = tool_field(description="Column to shift.")
-    periods: list = tool_field(description="Time intervals for shifting.")
-    freq: str = tool_field(
-        description="Frequency unit for time intervals (e.g., 'D', 'M').",
-        enum=["D", "M", "Y", "W", "H"],
-    )
-
-
-class FeRollingByTime(ToolSchema):
-    """Calculate rolling statistics for a DataFrame column over time intervals."""
-
-    df: pd.DataFrame = tool_field(description="DataFrame to process.")
-    time_col: str = tool_field(description="Column for time-based rolling.")
-    group_col: str = tool_field(description="Column for grouping before rolling.")
-    rolling_col: str = tool_field(description="Column for rolling calculations.")
-    periods: list = tool_field(description="Window sizes for rolling.")
-    freq: str = tool_field(
-        description="Frequency unit for time windows (e.g., 'D', 'M').",
-        enum=["D", "M", "Y", "W", "H"],
-    )
-    agg_funcs: list = tool_field(
-        description="""List of aggregation functions for rolling, like ['mean', 'std'].
-        Each function must be in ['mean', 'std', 'min', 'max', 'median', 'sum', 'count']."""
-    )
--- a/metagpt/tools/functions/schemas/feature_engineering.yml
+++ b/metagpt/tools/functions/schemas/feature_engineering.yml
@ -0,0 +1,433 @@
+PolynomialExpansion:
+  type: class
+  description: "Add polynomial and interaction features from selected numeric columns, excluding the bias column."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          cols:
+            type: list
+            description: "Columns for polynomial expansion."
+          degree:
+            type: int
+            description: "The degree of the polynomial features."
+            default: 2
+        required:
+          - cols
+    fit:
+      description: "Fit the PolynomialExpansion model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+
+CatCount:
+  type: class
+  description: "Add value counts of a categorical column as new feature."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          col:
+            type: str
+            description: "Column for value counts."
+        required:
+          - col
+    fit:
+      description: "Fit the CatCount model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+
+TargetMeanEncoder:
+  type: class
+  description: "Encodes a categorical column by the mean of the label column, and adds the result as a new feature."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          col:
+            type: str
+            description: "Column to be mean encoded."
+          label:
+            type: str
+            description: "Predicted label column."
+        required:
+          - col
+          - label
+    fit:
+      description: "Fit the TargetMeanEncoder model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+
+KFoldTargetMeanEncoder:
+  type: class
+  description: "Adds a new feature to the DataFrame by k-fold mean encoding of a categorical column using the label column."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          col:
+            type: str
+            description: "Column to be k-fold mean encoded."
+          label:
+            type: str
+            description: "Predicted label column."
+          n_splits:
+            type: int
+            description: "Number of splits for K-fold."
+            default: 5
+          random_state:
+            type: int
+            description: "Random seed."
+            default: 2021
+        required:
+          - col
+          - label
+    fit:
+      description: "Fit the KFoldTargetMeanEncoder model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+
+CatCross:
+  type: class
+  description: "Add pairwise crossed features and convert them to numerical features."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          cols:
+            type: list
+            description: "Columns to be pairwise crossed."
+          max_cat_num:
+            type: int
+            description: "Maximum unique categories per crossed feature."
+            default: 100
+        required:
+          - cols
+    fit:
+      description: "Fit the CatCross model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+
+GroupStat:
+  type: class
+  description: "Aggregate specified column in a DataFrame grouped by another column, adding new features named '<agg_col>_<agg_func>_by_<group_col>'."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          group_col:
+            type: str
+            description: "Column used for grouping."
+          agg_col:
+            type: str
+            description: "Column on which aggregation is performed."
+          agg_funcs:
+            type: list
+            description: >-
+              List of aggregation functions to apply, such as ['mean', 'std'].
+              Each function must be supported by pandas.
+        required:
+          - group_col
+          - agg_col
+          - agg_funcs
+    fit:
+      description: "Fit the GroupStat model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+
+SplitBins:
+  type: class
+  description: "Inplace binning of continuous data into intervals, returning integer-encoded bin identifiers directly."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          cols:
+            type: list
+            description: "Columns to be binned inplace."
+          strategy:
+            type: str
+            description: "Strategy used to define the widths of the bins."
+            default: quantile
+            enum:
+              - quantile
+              - uniform
+              - kmeans
+        required:
+          - cols
+    fit:
+      description: "Fit the SplitBins model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+
+GeneralSelection:
+  type: class
+  description: "Drop features that are entirely NaN and features with only one unique value."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          label_col:
+            type: str
+            description: "Label column name."
+        required:
+          - label_col
+    fit:
+      description: "Fit the GeneralSelection model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
--- a/metagpt/tools/functions/schemas/ml_model.py
+++ b/metagpt/tools/functions/schemas/ml_model.py
@ -1,55 +0,0 @@
-import pandas as pd
-
-from metagpt.tools.functions.schemas.base import tool_field, ToolSchema
-
-
-class LogisticRegressionClassification(ToolSchema):
-    """Logistic Regression (aka logit, MaxEnt) classifier"""
-    df: pd.DataFrame = tool_field(description="input dataframe")
-    label: str = tool_field(description="target name")
-    test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)
-    penalty: str = tool_field(description="Specify the norm of the penalty", default="l2")
-    dual: bool = tool_field(description="Dual (constrained) or primal (regularized) formulation", default="l2")
-
-
-class RandomForestClassification(ToolSchema):
-    """random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting"""
-    df: pd.DataFrame = tool_field(description="input dataframe")
-    label: str = tool_field(description="target name")
-    test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)
-    n_estimators: int = tool_field(description="The number of trees in the forest", default=100)
-    criterion: str = tool_field(description="The function to measure the quality of a split", default="gini")
-
-
-class GradientBoostingClassification(ToolSchema):
-    """Gradient Boosting for classification.This algorithm builds an additive model in a forward stage-wise fashion"""
-    df: pd.DataFrame = tool_field(description="input dataframe")
-    label: str = tool_field(description="target name")
-    test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)
-    n_estimators: int = tool_field(description="The number of boosting stages to perform", default=100)
-    learning_rate: float = tool_field(description="Learning rate shrinks the contribution of each tree by learning_rate", default=0.1)
-
-
-class LinearRegressionRegression(ToolSchema):
-    """Ordinary least squares Linear Regression."""
-    df: pd.DataFrame = tool_field(description="input dataframe")
-    label: str = tool_field(description="target name")
-    test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)
-
-
-class RandomForestRegression(ToolSchema):
-    """random forest is a meta estimator that fits a number of decision tree on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting"""
-    df: pd.DataFrame = tool_field(description="input dataframe")
-    label: str = tool_field(description="target name")
-    test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)
-    n_estimators: int = tool_field(description="The number of trees in the forest", default=100)
-    criterion: str = tool_field(description="The function to measure the quality of a split", default="squared_error")
-
-
-class GradientBoostingRegression(ToolSchema):
-    """Gradient Boosting for regression.This estimator builds an additive model in a forward stage-wise fashion"""
-    df: pd.DataFrame = tool_field(description="input dataframe")
-    label: str = tool_field(description="target name")
-    test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)
-    n_estimators: int = tool_field(description="The number of boosting stages to perform", default=100)
-    learning_rate: float = tool_field(description="Learning rate shrinks the contribution of each tree by learning_rate", default=0.1)