update ml tool from Function to Class

2026-07-20 16:51:07 +02:00 · 2023-12-12 10:56:05 +08:00 · 2023-12-12 10:56:05 +08:00 · 4f0d55656e
commit 4f0d55656e
parent fd31cc065a
3 changed files with 445 additions and 246 deletions
--- a/metagpt/tools/functions/libs/base.py
+++ b/metagpt/tools/functions/libs/base.py
@ -0,0 +1,16 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Time    : 2023/12/10 20:12
+# @Author  : lidanyang
+# @File    : base
+# @Desc    :
+class MLProcess(object):
+    """Base interface for fit/transform style processors.
+
+    ``fit`` and ``transform`` raise ``NotImplementedError`` here and are meant
+    to be overridden; ``fit_transform`` simply chains the two.
+    """
+
+    def fit(self, df):
+        """Learn any state needed from ``df``; must be overridden."""
+        raise NotImplementedError
+
+    def transform(self, df):
+        """Apply the fitted state to ``df``; must be overridden."""
+        raise NotImplementedError
+
+    def fit_transform(self, df):
+        """Call ``fit(df)`` and then return ``transform(df)``."""
+        self.fit(df)
+        return self.transform(df)
--- a/metagpt/tools/functions/libs/data_preprocess.py
+++ b/metagpt/tools/functions/libs/data_preprocess.py
@ -1,6 +1,6 @@
 import numpy as np
 from sklearn.impute import SimpleImputer
-from sklearn.preprocessing import KBinsDiscretizer
+from sklearn.preprocessing import KBinsDiscretizer, LabelEncoder
 from sklearn.preprocessing import MaxAbsScaler
 from sklearn.preprocessing import MinMaxScaler
 from sklearn.preprocessing import OneHotEncoder
@ -9,31 +9,52 @@ from sklearn.preprocessing import RobustScaler
 from sklearn.preprocessing import StandardScaler

 from metagpt.tools.functions import registry
+from metagpt.tools.functions.libs.base import MLProcess
 from metagpt.tools.functions.schemas.data_preprocess import *


-@registry.register("data_preprocess", FillMissingValue)
-def fill_missing_value(df: pd.DataFrame, features: list, strategy: str = 'mean', fill_value=None,):
-    df[features] = SimpleImputer(strategy=strategy, fill_value=fill_value).fit_transform(df[features])
-    return df
+class FillMissingValue(MLProcess):
+    """Impute missing values in the selected columns with a ``SimpleImputer``."""
+
+    def __init__(self, features: list, strategy: str = 'mean', fill_value=None,):
+        self.features = features
+        self.strategy = strategy
+        self.fill_value = fill_value
+        # Imputer instance; created and fitted by fit().
+        self.si = None
+
+    def fit(self, df: pd.DataFrame):
+        """Create a new imputer and fit it on ``df[self.features]``."""
+        imputer = SimpleImputer(strategy=self.strategy, fill_value=self.fill_value)
+        self.si = imputer
+        imputer.fit(df[self.features])
+
+    def transform(self, df: pd.DataFrame):
+        """Write the imputed values back into ``df`` and return ``df``."""
+        imputed = self.si.transform(df[self.features])
+        df[self.features] = imputed
+        return df


-@registry.register("data_preprocess", SplitBins)
-def split_bins(df: pd.DataFrame, features: list, strategy: str = 'quantile',):
-    df[features] = KBinsDiscretizer(strategy=strategy, encode='ordinal').fit_transform(df[features])
-    return df
+class MinMaxScale(MLProcess):
+    """Rescale the selected columns with scikit-learn's ``MinMaxScaler``."""
+
+    def __init__(self, features: list,):
+        self.features = features
+        # Scaler instance; created and fitted by fit().
+        self.mms = None
+
+    def fit(self, df: pd.DataFrame):
+        """Create a new ``MinMaxScaler`` and fit it on ``df[self.features]``."""
+        scaler = MinMaxScaler()
+        self.mms = scaler
+        scaler.fit(df[self.features])
+
+    def transform(self, df: pd.DataFrame):
+        """Write the scaled values back into ``df`` and return ``df``."""
+        scaled = self.mms.transform(df[self.features])
+        df[self.features] = scaled
+        return df


-@registry.register("data_preprocess", MinMaxScale)
-def min_max_scale(df: pd.DataFrame, features: list, ):
-    df[features] = MinMaxScaler().fit_transform(df[features])
-    return df
+class StandardScale(MLProcess):
+    """Standardize the selected columns with scikit-learn's ``StandardScaler``."""
+
+    def __init__(self, features: list,):
+        self.features = features
+        # Scaler instance; created and fitted by fit().
+        self.ss = None

+    def fit(self, df: pd.DataFrame):
+        """Create a new ``StandardScaler`` and fit it on ``df[self.features]``."""
+        scaler = StandardScaler()
+        self.ss = scaler
+        scaler.fit(df[self.features])

-@registry.register("data_preprocess", StandardScale)
-def standard_scale(df: pd.DataFrame, features: list, ):
-    df[features] = StandardScaler().fit_transform(df[features])
-    return df
+    def transform(self, df: pd.DataFrame):
+        """Write the standardized values back into ``df`` and return ``df``."""
+        scaled = self.ss.transform(df[self.features])
+        df[self.features] = scaled
+        return df


@registry.register("data_preprocess", LogTransform)
@ -45,80 +66,145 @@ def log_transform(df: pd.DataFrame, features: list, ):
    return df


-@registry.register("data_preprocess", MaxAbsScale)
-def max_abs_scale(df: pd.DataFrame, features: list, ):
-    df[features] = MaxAbsScaler().fit_transform(df[features])
-    return df
+class MaxAbsScale(MLProcess):
+    """Scale the selected columns with scikit-learn's ``MaxAbsScaler``."""
+
+    def __init__(self, features: list,):
+        self.features = features
+        # Scaler instance; created and fitted by fit().
+        self.mas = None
+
+    def fit(self, df: pd.DataFrame):
+        """Create a new ``MaxAbsScaler`` and fit it on ``df[self.features]``."""
+        scaler = MaxAbsScaler()
+        self.mas = scaler
+        scaler.fit(df[self.features])
+
+    def transform(self, df: pd.DataFrame):
+        """Write the scaled values back into ``df`` and return ``df``."""
+        scaled = self.mas.transform(df[self.features])
+        df[self.features] = scaled
+        return df


-@registry.register("data_preprocess", RobustScale)
-def robust_scale(df: pd.DataFrame, features: list, ):
-    df[features] = RobustScaler().fit_transform(df[features])
-    return df
+class RobustScale(MLProcess):
+    """Scale the selected columns with scikit-learn's ``RobustScaler``."""
+
+    def __init__(self, features: list,):
+        self.features = features
+        # Scaler instance; created and fitted by fit().
+        self.rs = None
+
+    def fit(self, df: pd.DataFrame):
+        """Create a new ``RobustScaler`` and fit it on ``df[self.features]``."""
+        scaler = RobustScaler()
+        self.rs = scaler
+        scaler.fit(df[self.features])
+
+    def transform(self, df: pd.DataFrame):
+        """Write the scaled values back into ``df`` and return ``df``."""
+        scaled = self.rs.transform(df[self.features])
+        df[self.features] = scaled
+        return df


-@registry.register("data_preprocess", OrdinalEncode)
-def ordinal_encode(df: pd.DataFrame, features: list,):
-    df[features] = OrdinalEncoder().fit_transform(df[features])
-    return df
+class OrdinalEncode(MLProcess):
+    """Encode the selected columns with scikit-learn's ``OrdinalEncoder``."""
+
+    def __init__(self, features: list,):
+        self.features = features
+        # Encoder instance; created and fitted by fit().
+        self.oe = None
+
+    def fit(self, df: pd.DataFrame):
+        """Create a new ``OrdinalEncoder`` and fit it on ``df[self.features]``."""
+        encoder = OrdinalEncoder()
+        self.oe = encoder
+        encoder.fit(df[self.features])
+
+    def transform(self, df: pd.DataFrame):
+        """Write the encoded values back into ``df`` and return ``df``."""
+        encoded = self.oe.transform(df[self.features])
+        df[self.features] = encoded
+        return df


-@registry.register("data_preprocess", OneHotEncoding)
-def one_hot_encoding(df, cols):
-    enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
-    ts_data = enc.fit_transform(df[cols])
-    new_columns = enc.get_feature_names_out(cols)
-    ts_data = pd.DataFrame(ts_data, columns=new_columns, index=df.index)
-    df.drop(cols, axis=1, inplace=True)
-    df = pd.concat([df, ts_data], axis=1)
-    return df
+class OneHotEncode(MLProcess):
+    """One-hot encode the selected columns, replacing them with indicator columns.
+
+    The encoder is built with ``handle_unknown="ignore"`` and dense output.
+    """
+
+    def __init__(self, features: list,):
+        self.features = features
+        # Encoder instance; created and fitted by fit().
+        self.ohe = None
+
+    def fit(self, df: pd.DataFrame):
+        """Create a new ``OneHotEncoder`` and fit it on ``df[self.features]``."""
+        try:
+            # scikit-learn >= 1.2 names the dense-output flag ``sparse_output``;
+            # the older ``sparse`` keyword was removed in 1.4.
+            self.ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
+        except TypeError:
+            # Older scikit-learn releases only accept ``sparse``.
+            self.ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)
+        self.ohe.fit(df[self.features])
+
+    def transform(self, df: pd.DataFrame):
+        """Return ``df`` with the feature columns replaced by their one-hot columns.
+
+        Note that the feature columns are dropped from the passed-in ``df`` in
+        place; the encoded columns exist only on the returned DataFrame.
+        """
+        ts_data = self.ohe.transform(df[self.features])
+        new_columns = self.ohe.get_feature_names_out(self.features)
+        ts_data = pd.DataFrame(ts_data, columns=new_columns, index=df.index)
+        df.drop(self.features, axis=1, inplace=True)
+        df = pd.concat([df, ts_data], axis=1)
+        return df


-if __name__ == '__main__':
-    def run():
-        V = {
-            'a': [-1, 2, 3, 6, 5, 4],
-            'b': [1.1, 2.2, 3.3, 6.6, 5.5, 4.4],
-            'c': ['aa', 'bb', 'cc', 'dd', 'ee', 'ff'],
-            'd': [1, None, 3, None, 5, 4],
-            'e': [1.1, np.NAN, 3.3, None, 5.5, 4.4],
-            'f': ['aa', np.NAN, 'cc', None, '', 'ff'],
+class LabelEncode(MLProcess):
+    """Label-encode the selected columns, one ``LabelEncoder`` per column.
+
+    Values are compared as strings. An extra ``'unknown'`` class is added at
+    fit time, and values not seen during ``fit`` are encoded as that class.
+    """
+
+    def __init__(self, features: list,):
+        self.features = features
+        # One fitted encoder per entry of ``features``, in the same order.
+        self.le_encoders = []

-        }
+    def fit(self, df: pd.DataFrame):
+        """Fit one encoder per feature column on its string values plus 'unknown'."""
+        # Build a fresh list so that calling fit() again replaces the encoders
+        # instead of appending after stale ones (transform pairs by position).
+        encoders = []
+        for col in self.features:
+            le = LabelEncoder().fit(df[col].astype(str).unique().tolist() + ['unknown'])
+            encoders.append(le)
+        self.le_encoders = encoders

-        df = pd.DataFrame(V)
-        print(df.dtypes)
+    def transform(self, df: pd.DataFrame):
+        """Replace each feature column in ``df`` with its integer codes; return ``df``."""
+        for col, le in zip(self.features, self.le_encoders):
+            known = set(le.classes_)
+            # Map anything the encoder has not seen to the 'unknown' class.
+            data_list = [
+                x if x in known else 'unknown'
+                for x in df[col].astype(str).tolist()
+            ]
+            df[col] = le.transform(data_list)
+        return df

-        numeric_features = ['a', 'b', 'd', 'e']
-        numeric_features_wo_miss = ['a', 'b', ]
-        categorial_features = ['c', 'f']

-        df_ = fill_missing_value(df.copy(), numeric_features)
-        print(df_)
-        df_ = fill_missing_value(df.copy(), categorial_features, strategy='constant', fill_value='hehe')
-        print(df_)
+def get_column_info(df: pd.DataFrame) -> str:
+    """Summarize every column of ``df`` as a plain-text table.
+
+    Each row lists the column name, its dtype, the percentage of missing
+    values (rounded to two significant digits) and the number of unique values.
+    """
+    rows = [
+        [
+            name,
+            df[name].dtype,
+            float("%.2g" % (df[name].isna().mean() * 100)),
+            df[name].nunique(),
+        ]
+        for name in df.columns
+    ]

-        df_ = fill_missing_value(df.copy(), numeric_features, strategy='constant', fill_value=999)
-        print(df_)
-
-        # df_ = label_encode(df.copy(), numeric_features + categorial_features, )
-        # print(df_)
-
-        df_ = split_bins(df.copy(), numeric_features_wo_miss, strategy='quantile')
-        print(df_)
-
-        df_ = min_max_scale(df.copy(), numeric_features, )
-        print(df_)
-
-        df_ = standard_scale(df.copy(), numeric_features, )
-        print(df_)
-
-        df_ = log_transform(df.copy(), numeric_features, )
-        print(df_)
-
-        df_ = max_abs_scale(df.copy(), numeric_features, )
-        print(df_)
-
-        df_ = robust_scale(df.copy(), numeric_features, )
-        print(df_)
-    run()
+    summary = pd.DataFrame(
+        rows,
+        columns=["Column_name", "Data_type", "NaN_Frequency(%)", "N_unique"],
+    )
+    return summary.to_string(index=False)
+#
+#
+# if __name__ == '__main__':
+#     def run():
+#         V = {
+#             'a': [-1, 2, 3, 6, 5, 4],
+#             'b': [1.1, 2.2, 3.3, 6.6, 5.5, 4.4],
+#             'c': ['aa', 'bb', 'cc', 'dd', 'ee', 'ff'],
+#             'd': [1, None, 3, None, 5, 4],
+#             'e': [1.1, np.NAN, 3.3, None, 5.5, 4.4],
+#             'f': ['aa', np.NAN, 'cc', None, '', 'ff'],
+#
+#         }
+#
+#         df = pd.DataFrame(V)
+#         print(df.dtypes)
+#
+#         numeric_features = ['a', 'b', 'd', 'e']
+#         numeric_features_wo_miss = ['a', 'b', ]
+#         categorial_features = ['c', 'f']
+#
+#         df_ = fill_missing_value(df.copy(), numeric_features)
+#         print(df_)
+#         df_ = fill_missing_value(df.copy(), categorial_features, strategy='constant', fill_value='hehe')
+#         print(df_)
+#
+#         df_ = fill_missing_value(df.copy(), numeric_features, strategy='constant', fill_value=999)
+#         print(df_)
+#
+#         # df_ = label_encode(df.copy(), numeric_features + categorial_features, )
+#         # print(df_)
+#
+#         df_ = split_bins(df.copy(), numeric_features_wo_miss, strategy='quantile')
+#         print(df_)
+#
+#         df_ = min_max_scale(df.copy(), numeric_features, )
+#         print(df_)
+#
+#         df_ = standard_scale(df.copy(), numeric_features, )
+#         print(df_)
+#
+#         df_ = log_transform(df.copy(), numeric_features, )
+#         print(df_)
+#
+#         df_ = max_abs_scale(df.copy(), numeric_features, )
+#         print(df_)
+#
+#         df_ = robust_scale(df.copy(), numeric_features, )
+#         print(df_)
+#     run()
--- a/metagpt/tools/functions/libs/feature_engineering.py
+++ b/metagpt/tools/functions/libs/feature_engineering.py
@ -3,188 +3,285 @@
 # @Time    : 2023/11/17 10:33
 # @Author  : lidanyang
 # @File    : feature_engineering.py
-# @Desc    : Feature Engineering Functions
+# @Desc    : Feature Engineering Tools
 import itertools

+import numpy as np
 from dateutil.relativedelta import relativedelta
+from joblib import Parallel, delayed
 from pandas.api.types import is_numeric_dtype
 from sklearn.model_selection import KFold
-from sklearn.preprocessing import PolynomialFeatures
+from sklearn.preprocessing import PolynomialFeatures, KBinsDiscretizer

-from metagpt.tools.functions import registry
+from metagpt.tools.functions.libs.base import MLProcess
 from metagpt.tools.functions.schemas.feature_engineering import *


-@registry.register("feature_engineering", PolynomialExpansion)
-def polynomial_expansion(df, cols, degree=2):
-    for col in cols:
-        if not is_numeric_dtype(df[col]):
-            raise ValueError(f"Column '{col}' must be numeric.")
+class PolynomialExpansion(MLProcess):
+    """Expand the selected columns into polynomial features (no bias term).
+
+    Missing values are treated as 0 for both fitting and transforming.
+    """
+
+    def __init__(self, cols: list, degree: int = 2):
+        self.cols = cols
+        self.degree = degree
+        self.poly = PolynomialFeatures(degree=degree, include_bias=False)

-    poly = PolynomialFeatures(degree=degree, include_bias=False)
-    ts_data = poly.fit_transform(df[cols].fillna(0))
-    new_columns = poly.get_feature_names_out(cols)
-    ts_data = pd.DataFrame(ts_data, columns=new_columns, index=df.index)
-    ts_data = ts_data.drop(cols, axis=1)
-    df = pd.concat([df, ts_data], axis=1)
-    return df
+    def fit(self, df: pd.DataFrame):
+        """Fit the polynomial transformer on the zero-filled ``df[self.cols]``."""
+        filled = df[self.cols].fillna(0)
+        self.poly.fit(filled)
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Return ``df`` with ``self.cols`` replaced by the polynomial output.
+
+        The source columns are dropped from the passed-in ``df`` in place and
+        the transformer's output columns are appended to the returned frame.
+        """
+        filled = df[self.cols].fillna(0)
+        expanded = pd.DataFrame(
+            self.poly.transform(filled),
+            index=df.index,
+            columns=self.poly.get_feature_names_out(self.cols),
+        )
+        df.drop(self.cols, axis=1, inplace=True)
+        return pd.concat([df, expanded], axis=1)


-@registry.register("feature_engineering", FrequencyEncoding)
-def frequency_encoding(df, cols):
-    for col in cols:
-        encoder_dict = df[col].value_counts().to_dict()
-        df[f"{col}_cnt"] = df[col].map(encoder_dict)
-    return df
+class CatCount(MLProcess):
+    """Add a ``<col>_cnt`` column with each value's count observed at fit time."""
+
+    def __init__(self, col: str):
+        self.col = col
+        # value -> count mapping; built by fit().
+        self.encoder_dict = None
+
+    def fit(self, df: pd.DataFrame):
+        """Record how often each value of ``self.col`` occurs in ``df``."""
+        counts = df[self.col].value_counts()
+        self.encoder_dict = counts.to_dict()
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Add the count column to ``df`` and return ``df``."""
+        out_col = f"{self.col}_cnt"
+        df[out_col] = df[self.col].map(self.encoder_dict)
+        return df


-@registry.register("feature_engineering", TargetMeanEncoder)
-def target_mean_encoder(df, col, label):
-    encoder_dict = df.groupby(col)[label].mean().to_dict()
-    df[f"{col}_target_mean"] = df[col].map(encoder_dict)
-    return df
+class TargetMeanEncoder(MLProcess):
+    """Add a ``<col>_target_mean`` column with the per-category mean of the label."""
+
+    def __init__(self, col: str, label: str):
+        self.col = col
+        self.label = label
+        # category -> mean label mapping; built by fit().
+        self.encoder_dict = None
+
+    def fit(self, df: pd.DataFrame):
+        """Compute the mean of ``self.label`` for each value of ``self.col``."""
+        means = df.groupby(self.col)[self.label].mean()
+        self.encoder_dict = means.to_dict()
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Add the encoded column to ``df`` and return ``df``."""
+        out_col = f"{self.col}_target_mean"
+        df[out_col] = df[self.col].map(self.encoder_dict)
+        return df


-@registry.register("feature_engineering", KFoldTargetMeanEncoder)
-def k_fold_target_mean_encoder(df, col, label, n_splits=5, random_state=2021):
-    tmp = df.copy()
-    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
+class KFoldTargetMeanEncoder(MLProcess):
+    """Add a ``<col>_kf_target_mean`` column built from out-of-fold label means.
+
+    ``fit`` encodes every row with the per-category label mean computed on the
+    other folds, then averages those values per category into a lookup table.
+    ``transform`` maps ``self.col`` through that table.
+    """
+
+    def __init__(self, col: str, label: str, n_splits: int = 5, random_state: int = 2021):
+        self.col = col
+        self.label = label
+        self.n_splits = n_splits
+        self.random_state = random_state
+        # category -> encoded value mapping; built by fit().
+        self.encoder_dict = None

-    global_mean = tmp[label].mean()
-    col_name = f"{col}_kf_target_mean"
-    for trn_idx, val_idx in kf.split(tmp, tmp[label]):
-        _trn, _val = tmp.iloc[trn_idx], tmp.iloc[val_idx]
-        tmp.loc[tmp.index[val_idx], col_name] = _val[col].map(
-            _trn.groupby(col)[label].mean()
-        )
-    tmp[col_name].fillna(global_mean, inplace=True)
-    encoder_dict = tmp.groupby(col)[col_name].mean().to_dict()
-    df[f"{col}_kf_target_mean"] = df[col].map(encoder_dict)
-    return df
+    def fit(self, df: pd.DataFrame):
+        """Build ``self.encoder_dict`` from a copy of ``df``; ``df`` is not modified."""
+        tmp = df.copy()
+        kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state)

-
-@registry.register("feature_engineering", CatCross)
-def cat_cross(df, cols, max_cat_num=100):
-    for col in cols:
-        if df[col].nunique() > max_cat_num:
-            cols.remove(col)
-
-    for col1, col2 in itertools.combinations(cols, 2):
-        cross_col = f"{col1}_cross_{col2}"
-        crossed = df[col1].astype(str) + "_" + df[col2].astype(str)
-        df[cross_col] = crossed.astype('category').cat.codes
-    return df
-
-
-@registry.register("feature_engineering", GroupStat)
-def group_stat(df, group_col, agg_col, agg_funcs):
-    group_df = df.groupby(group_col)[agg_col].agg(agg_funcs).reset_index()
-    group_df.columns = group_col + [
-        f"{agg_col}_{agg_func}_by_{group_col}" for agg_func in agg_funcs
-    ]
-    df = df.merge(group_df, on=group_col, how="left")
-    return df
-
-
-@registry.register("feature_engineering", ExtractTimeComps)
-def extract_time_comps(df, time_col, time_comps):
-    time_s = pd.to_datetime(df[time_col], errors="coerce")
-    time_comps_df = pd.DataFrame()
-
-    if "year" in time_comps:
-        time_comps_df["year"] = time_s.dt.year
-    if "month" in time_comps:
-        time_comps_df["month"] = time_s.dt.month
-    if "day" in time_comps:
-        time_comps_df["day"] = time_s.dt.day
-    if "hour" in time_comps:
-        time_comps_df["hour"] = time_s.dt.hour
-    if "dayofweek" in time_comps:
-        time_comps_df["dayofweek"] = time_s.dt.dayofweek + 1
-    if "is_weekend" in time_comps:
-        time_comps_df["is_weekend"] = time_s.dt.dayofweek.isin([5, 6]).astype(int)
-    df = pd.concat([df, time_comps_df], axis=1)
-    return df
-
-
-@registry.register("feature_engineering", FeShiftByTime)
-def fe_shift_by_time(df, time_col, group_col, shift_col, periods, freq):
-    df[time_col] = pd.to_datetime(df[time_col])
-
-    def shift_datetime(date, offset, unit):
-        if unit in ["year", "y", "Y"]:
-            return date + relativedelta(years=offset)
-        elif unit in ["month", "m", "M"]:
-            return date + relativedelta(months=offset)
-        elif unit in ["day", "d", "D"]:
-            return date + relativedelta(days=offset)
-        elif unit in ["week", "w", "W"]:
-            return date + relativedelta(weeks=offset)
-        elif unit in ["hour", "h", "H"]:
-            return date + relativedelta(hours=offset)
-        else:
-            return date
-
-    def shift_by_time_on_key(
-        inner_df, time_col, group_col, shift_col, offset, unit, col_name
-    ):
-        inner_df = inner_df.drop_duplicates()
-        inner_df[time_col] = inner_df[time_col].map(
-            lambda x: shift_datetime(x, offset, unit)
-        )
-        inner_df = inner_df.groupby([time_col, group_col], as_index=False)[
-            shift_col
-        ].mean()
-        inner_df.rename(columns={shift_col: col_name}, inplace=True)
-        return inner_df
-
-    shift_df = df[[time_col, group_col, shift_col]].copy()
-    for period in periods:
-        new_col_name = f"{group_col}_{shift_col}_lag_{period}_{freq}"
-        tmp = shift_by_time_on_key(
-            shift_df, time_col, group_col, shift_col, period, freq, new_col_name
-        )
-        df = df.merge(tmp, on=[time_col, group_col], how="left")
-
-    return df
-
-
-@registry.register("feature_engineering", FeRollingByTime)
-def fe_rolling_by_time(df, time_col, group_col, rolling_col, periods, freq, agg_funcs):
-    df[time_col] = pd.to_datetime(df[time_col])
-
-    def rolling_by_time_on_key(inner_df, offset, unit, agg_func, col_name):
-        time_freq = {
-            "Y": [365 * offset, "D"],
-            "M": [30 * offset, "D"],
-            "D": [offset, "D"],
-            "W": [7 * offset, "D"],
-            "H": [offset, "h"],
-        }
-
-        if agg_func not in ["mean", "std", "max", "min", "median", "sum", "count"]:
-            raise ValueError(f"Invalid agg function: {agg_func}")
-
-        rolling_feat = inner_df.rolling(
-            f"{time_freq[unit][0]}{time_freq[unit][1]}", closed="left"
-        )
-        rolling_feat = getattr(rolling_feat, agg_func)()
-        depth = df.columns.nlevels
-        rolling_feat = rolling_feat.stack(list(range(depth)))
-        rolling_feat.name = col_name
-        return rolling_feat
-
-    rolling_df = df[[time_col, group_col, rolling_col]].copy()
-    for period in periods:
-        for func in agg_funcs:
-            new_col_name = f"{group_col}_{rolling_col}_rolling_{period}_{freq}_{func}"
-            tmp = pd.pivot_table(
-                rolling_df,
-                index=time_col,
-                values=rolling_col,
-                columns=group_col,
+        global_mean = tmp[self.label].mean()
+        col_name = f"{self.col}_kf_target_mean"
+        for trn_idx, val_idx in kf.split(tmp, tmp[self.label]):
+            _trn, _val = tmp.iloc[trn_idx], tmp.iloc[val_idx]
+            # Encode the held-out rows with means computed on the other folds only.
+            tmp.loc[tmp.index[val_idx], col_name] = _val[self.col].map(
+                _trn.groupby(self.col)[self.label].mean()
            )
-            tmp = rolling_by_time_on_key(tmp, period, freq, func, new_col_name)
-            df = df.merge(tmp, on=[time_col, group_col], how="left")
+        # Categories absent from a training fold fall back to the global mean.
+        # Assign the result back: an in-place fillna on ``tmp[col_name]`` acts on
+        # a temporary object and is a no-op under pandas Copy-on-Write.
+        tmp[col_name] = tmp[col_name].fillna(global_mean)
+        self.encoder_dict = tmp.groupby(self.col)[col_name].mean().to_dict()

-    return df
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Add the encoded column to ``df`` and return ``df``."""
+        df[f"{self.col}_kf_target_mean"] = df[self.col].map(self.encoder_dict)
+        return df
+
+
+class CatCross(MLProcess):
+    """Add an integer-coded cross feature for every pair of the given columns.
+
+    Columns with more than ``max_cat_num`` unique values at fit time are
+    excluded. Value pairs not seen during ``fit`` receive one extra code.
+    """
+
+    def __init__(self, cols: list, max_cat_num: int = 100):
+        self.cols = cols
+        self.max_cat_num = max_cat_num
+        # Column pairs to cross; built by fit().
+        self.combs = []
+        # new column name -> {(value_a, value_b): code}; built by fit().
+        self.combs_map = {}
+
+    @staticmethod
+    def cross_two(comb, df):
+        """Return ``(new_col, comb_map)`` for the column pair ``comb``.
+
+        ``comb_map`` assigns consecutive integers, starting at 0, to every
+        combination of the two columns' unique values.
+        """
+        new_col = f'{comb[0]}_{comb[1]}'
+        new_col_combs = itertools.product(df[comb[0]].unique(), df[comb[1]].unique())
+        comb_map = {pair: code for code, pair in enumerate(new_col_combs)}
+        return new_col, comb_map
+
+    def fit(self, df: pd.DataFrame):
+        """Select the usable columns and build one code map per column pair."""
+        # Build a new list rather than removing while iterating, which skips
+        # the element after each removal and mutates the caller's list.
+        self.cols = [
+            col for col in self.cols if df[col].nunique() <= self.max_cat_num
+        ]
+        self.combs = list(itertools.combinations(self.cols, 2))
+        res = Parallel(n_jobs=4, require='sharedmem')(
+            delayed(self.cross_two)(comb, df) for comb in self.combs)
+        self.combs_map = dict(res)
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Add one ``<a>_<b>`` integer column per fitted pair to ``df``; return ``df``."""
+        for comb in self.combs:
+            new_col = f'{comb[0]}_{comb[1]}'
+            _map = self.combs_map[new_col]
+            # Give the pair series df's own index so the column assignment
+            # lines up row by row even when df does not use a 0..n-1 index.
+            pairs = pd.Series(list(zip(df[comb[0]], df[comb[1]])), index=df.index)
+            # Codes run 0..len(_map)-1, so len(_map) is a fresh code for
+            # pairs that were not seen during fit.
+            df[new_col] = pairs.map(_map).fillna(len(_map)).astype(int)
+        return df
+
+
+class GroupStat(MLProcess):
+    """Join per-group aggregate statistics of one column onto the DataFrame.
+
+    New columns are named ``<agg_col>_<agg_func>_by_<group_col>``.
+    """
+
+    def __init__(self, group_col: str, agg_col: str, agg_funcs: list):
+        self.group_col = group_col
+        self.agg_col = agg_col
+        self.agg_funcs = agg_funcs
+        # Aggregated lookup table; built by fit().
+        self.group_df = None
+
+    def fit(self, df: pd.DataFrame):
+        """Aggregate ``agg_col`` by ``group_col`` with every function in ``agg_funcs``."""
+        stat_names = [
+            f"{self.agg_col}_{agg_func}_by_{self.group_col}" for agg_func in self.agg_funcs
+        ]
+        grouped = df.groupby(self.group_col)[self.agg_col]
+        stats = grouped.agg(self.agg_funcs).reset_index()
+        stats.columns = [self.group_col, *stat_names]
+        self.group_df = stats
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Return ``df`` left-merged with the fitted statistics on ``group_col``."""
+        return df.merge(self.group_df, on=self.group_col, how="left")
+
+
+class SplitBins(MLProcess):
+    """Discretize the selected columns into ordinal bins with ``KBinsDiscretizer``.
+
+    Missing values are treated as 0 for both fitting and transforming.
+    """
+
+    def __init__(self, cols: list, strategy: str = 'quantile'):
+        # ``cols`` is a list of column names: ``df[self.cols]`` must select a
+        # 2-D frame for the discretizer.
+        self.cols = cols
+        self.strategy = strategy
+        # Discretizer instance; created and fitted by fit().
+        self.encoder = None
+
+    def fit(self, df: pd.DataFrame):
+        """Create a new discretizer and fit it on the zero-filled ``df[self.cols]``."""
+        self.encoder = KBinsDiscretizer(strategy=self.strategy, encode='ordinal')
+        self.encoder.fit(df[self.cols].fillna(0))
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Replace ``df[self.cols]`` with their bin indices and return ``df``."""
+        df[self.cols] = self.encoder.transform(df[self.cols].fillna(0))
+        return df
+
+# @registry.register("feature_engineering", ExtractTimeComps)
+# def extract_time_comps(df, time_col, time_comps):
+#     time_s = pd.to_datetime(df[time_col], errors="coerce")
+#     time_comps_df = pd.DataFrame()
+#
+#     if "year" in time_comps:
+#         time_comps_df["year"] = time_s.dt.year
+#     if "month" in time_comps:
+#         time_comps_df["month"] = time_s.dt.month
+#     if "day" in time_comps:
+#         time_comps_df["day"] = time_s.dt.day
+#     if "hour" in time_comps:
+#         time_comps_df["hour"] = time_s.dt.hour
+#     if "dayofweek" in time_comps:
+#         time_comps_df["dayofweek"] = time_s.dt.dayofweek + 1
+#     if "is_weekend" in time_comps:
+#         time_comps_df["is_weekend"] = time_s.dt.dayofweek.isin([5, 6]).astype(int)
+#     df = pd.concat([df, time_comps_df], axis=1)
+#     return df
+#
+#
+# @registry.register("feature_engineering", FeShiftByTime)
+# def fe_shift_by_time(df, time_col, group_col, shift_col, periods, freq):
+#     df[time_col] = pd.to_datetime(df[time_col])
+#
+#     def shift_datetime(date, offset, unit):
+#         if unit in ["year", "y", "Y"]:
+#             return date + relativedelta(years=offset)
+#         elif unit in ["month", "m", "M"]:
+#             return date + relativedelta(months=offset)
+#         elif unit in ["day", "d", "D"]:
+#             return date + relativedelta(days=offset)
+#         elif unit in ["week", "w", "W"]:
+#             return date + relativedelta(weeks=offset)
+#         elif unit in ["hour", "h", "H"]:
+#             return date + relativedelta(hours=offset)
+#         else:
+#             return date
+#
+#     def shift_by_time_on_key(
+#         inner_df, time_col, group_col, shift_col, offset, unit, col_name
+#     ):
+#         inner_df = inner_df.drop_duplicates()
+#         inner_df[time_col] = inner_df[time_col].map(
+#             lambda x: shift_datetime(x, offset, unit)
+#         )
+#         inner_df = inner_df.groupby([time_col, group_col], as_index=False)[
+#             shift_col
+#         ].mean()
+#         inner_df.rename(columns={shift_col: col_name}, inplace=True)
+#         return inner_df
+#
+#     shift_df = df[[time_col, group_col, shift_col]].copy()
+#     for period in periods:
+#         new_col_name = f"{group_col}_{shift_col}_lag_{period}_{freq}"
+#         tmp = shift_by_time_on_key(
+#             shift_df, time_col, group_col, shift_col, period, freq, new_col_name
+#         )
+#         df = df.merge(tmp, on=[time_col, group_col], how="left")
+#
+#     return df
+#
+#
+# @registry.register("feature_engineering", FeRollingByTime)
+# def fe_rolling_by_time(df, time_col, group_col, rolling_col, periods, freq, agg_funcs):
+#     df[time_col] = pd.to_datetime(df[time_col])
+#
+#     def rolling_by_time_on_key(inner_df, offset, unit, agg_func, col_name):
+#         time_freq = {
+#             "Y": [365 * offset, "D"],
+#             "M": [30 * offset, "D"],
+#             "D": [offset, "D"],
+#             "W": [7 * offset, "D"],
+#             "H": [offset, "h"],
+#         }
+#
+#         if agg_func not in ["mean", "std", "max", "min", "median", "sum", "count"]:
+#             raise ValueError(f"Invalid agg function: {agg_func}")
+#
+#         rolling_feat = inner_df.rolling(
+#             f"{time_freq[unit][0]}{time_freq[unit][1]}", closed="left"
+#         )
+#         rolling_feat = getattr(rolling_feat, agg_func)()
+#         depth = df.columns.nlevels
+#         rolling_feat = rolling_feat.stack(list(range(depth)))
+#         rolling_feat.name = col_name
+#         return rolling_feat
+#
+#     rolling_df = df[[time_col, group_col, rolling_col]].copy()
+#     for period in periods:
+#         for func in agg_funcs:
+#             new_col_name = f"{group_col}_{rolling_col}_rolling_{period}_{freq}_{func}"
+#             tmp = pd.pivot_table(
+#                 rolling_df,
+#                 index=time_col,
+#                 values=rolling_col,
+#                 columns=group_col,
+#             )
+#             tmp = rolling_by_time_on_key(tmp, period, freq, func, new_col_name)
+#             df = df.merge(tmp, on=[time_col, group_col], how="left")
+#
+#     return df
+
+
+class GeneralSelection(MLProcess):
+    """Keep only the feature columns that carry usable information.
+
+    ``fit`` keeps every non-label column except those that are entirely null,
+    hold a single distinct value, or contain an infinite value. ``transform``
+    returns only the kept columns, so the label column is not in its output.
+    """
+
+    def __init__(self, label_col: str):
+        self.label_col = label_col
+        # Names of the columns to keep; built by fit().
+        self.feats = []
+
+    def fit(self, df: pd.DataFrame):
+        """Record in ``self.feats`` the columns of ``df`` to keep."""
+        n_rows = df.shape[0]
+        feats = []
+        # Build the keep-list directly: each column is judged once, so a column
+        # failing several checks (or the label column) cannot trigger a second
+        # removal of a name that is no longer in the list.
+        for col in df.columns:
+            if col == self.label_col:
+                continue
+            series = df[col]
+            # Entirely missing.
+            if n_rows > 0 and series.isnull().sum() == n_rows:
+                continue
+            # Constant.
+            if series.nunique() == 1:
+                continue
+            # Contains an infinite value of either sign.
+            if series.isin([np.inf, -np.inf]).any():
+                continue
+            feats.append(col)
+        self.feats = feats
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Return ``df`` restricted to the columns selected by ``fit``."""
+        df = df[self.feats]
+        return df