diff --git a/metagpt/tools/functions/libs/base.py b/metagpt/tools/functions/libs/base.py new file mode 100644 index 000000000..c39adc66b --- /dev/null +++ b/metagpt/tools/functions/libs/base.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# @Time : 2023/12/10 20:12 +# @Author : lidanyang +# @File : base +# @Desc : +class MLProcess(object): + def fit(self, df): + raise NotImplementedError + + def transform(self, df): + raise NotImplementedError + + def fit_transform(self, df): + self.fit(df) + return self.transform(df) diff --git a/metagpt/tools/functions/libs/data_preprocess.py b/metagpt/tools/functions/libs/data_preprocess.py index 5579c5bd8..39474b0fd 100644 --- a/metagpt/tools/functions/libs/data_preprocess.py +++ b/metagpt/tools/functions/libs/data_preprocess.py @@ -1,6 +1,6 @@ import numpy as np from sklearn.impute import SimpleImputer -from sklearn.preprocessing import KBinsDiscretizer +from sklearn.preprocessing import KBinsDiscretizer, LabelEncoder from sklearn.preprocessing import MaxAbsScaler from sklearn.preprocessing import MinMaxScaler from sklearn.preprocessing import OneHotEncoder @@ -9,31 +9,52 @@ from sklearn.preprocessing import RobustScaler from sklearn.preprocessing import StandardScaler from metagpt.tools.functions import registry +from metagpt.tools.functions.libs.base import MLProcess from metagpt.tools.functions.schemas.data_preprocess import * -@registry.register("data_preprocess", FillMissingValue) -def fill_missing_value(df: pd.DataFrame, features: list, strategy: str = 'mean', fill_value=None,): - df[features] = SimpleImputer(strategy=strategy, fill_value=fill_value).fit_transform(df[features]) - return df +class FillMissingValue(MLProcess): + def __init__(self, features: list, strategy: str = 'mean', fill_value=None,): + self.features = features + self.strategy = strategy + self.fill_value = fill_value + self.si = None + + def fit(self, df: pd.DataFrame): + self.si = SimpleImputer(strategy=self.strategy, fill_value=self.fill_value) + self.si.fit(df[self.features]) + + def transform(self, df: pd.DataFrame): + df[self.features] = self.si.transform(df[self.features]) + return df -@registry.register("data_preprocess", SplitBins) -def split_bins(df: pd.DataFrame, features: list, strategy: str = 'quantile',): - df[features] = KBinsDiscretizer(strategy=strategy, encode='ordinal').fit_transform(df[features]) - return df +class MinMaxScale(MLProcess): + def __init__(self, features: list,): + self.features = features + self.mms = None + + def fit(self, df: pd.DataFrame): + self.mms = MinMaxScaler() + self.mms.fit(df[self.features]) + + def transform(self, df: pd.DataFrame): + df[self.features] = self.mms.transform(df[self.features]) + return df -@registry.register("data_preprocess", MinMaxScale) -def min_max_scale(df: pd.DataFrame, features: list, ): - df[features] = MinMaxScaler().fit_transform(df[features]) - return df +class StandardScale(MLProcess): + def __init__(self, features: list,): + self.features = features + self.ss = None + def fit(self, df: pd.DataFrame): + self.ss = StandardScaler() + self.ss.fit(df[self.features]) -@registry.register("data_preprocess", StandardScale) -def standard_scale(df: pd.DataFrame, features: list, ): - df[features] = StandardScaler().fit_transform(df[features]) - return df + def transform(self, df: pd.DataFrame): + df[self.features] = self.ss.transform(df[self.features]) + return df @registry.register("data_preprocess", LogTransform) @@ -45,80 +66,145 @@ def log_transform(df: pd.DataFrame, features: list, ): return df -@registry.register("data_preprocess", MaxAbsScale) -def max_abs_scale(df: pd.DataFrame, features: list, ): - df[features] = MaxAbsScaler().fit_transform(df[features]) - return df +class MaxAbsScale(MLProcess): + def __init__(self, features: list,): + self.features = features + self.mas = None + + def fit(self, df: pd.DataFrame): + self.mas = MaxAbsScaler() + self.mas.fit(df[self.features]) + + def transform(self, df: pd.DataFrame): + df[self.features] = self.mas.transform(df[self.features]) + return df -@registry.register("data_preprocess", RobustScale) -def robust_scale(df: pd.DataFrame, features: list, ): - df[features] = RobustScaler().fit_transform(df[features]) - return df +class RobustScale(MLProcess): + def __init__(self, features: list,): + self.features = features + self.rs = None + + def fit(self, df: pd.DataFrame): + self.rs = RobustScaler() + self.rs.fit(df[self.features]) + + def transform(self, df: pd.DataFrame): + df[self.features] = self.rs.transform(df[self.features]) + return df -@registry.register("data_preprocess", OrdinalEncode) -def ordinal_encode(df: pd.DataFrame, features: list,): - df[features] = OrdinalEncoder().fit_transform(df[features]) - return df +class OrdinalEncode(MLProcess): + def __init__(self, features: list,): + self.features = features + self.oe = None + + def fit(self, df: pd.DataFrame): + self.oe = OrdinalEncoder() + self.oe.fit(df[self.features]) + + def transform(self, df: pd.DataFrame): + df[self.features] = self.oe.transform(df[self.features]) + return df -@registry.register("data_preprocess", OneHotEncoding) -def one_hot_encoding(df, cols): - enc = OneHotEncoder(handle_unknown="ignore", sparse=False) - ts_data = enc.fit_transform(df[cols]) - new_columns = enc.get_feature_names_out(cols) - ts_data = pd.DataFrame(ts_data, columns=new_columns, index=df.index) - df.drop(cols, axis=1, inplace=True) - df = pd.concat([df, ts_data], axis=1) - return df +class OneHotEncode(MLProcess): + def __init__(self, features: list,): + self.features = features + self.ohe = None + + def fit(self, df: pd.DataFrame): + self.ohe = OneHotEncoder(handle_unknown="ignore", sparse=False) + self.ohe.fit(df[self.features]) + + def transform(self, df: pd.DataFrame): + ts_data = self.ohe.transform(df[self.features]) + new_columns = self.ohe.get_feature_names_out(self.features) + ts_data = pd.DataFrame(ts_data, columns=new_columns, index=df.index) + df.drop(self.features, axis=1, inplace=True) + df = pd.concat([df, ts_data], axis=1) + return df -if __name__ == '__main__': - def run(): - V = { - 'a': [-1, 2, 3, 6, 5, 4], - 'b': [1.1, 2.2, 3.3, 6.6, 5.5, 4.4], - 'c': ['aa', 'bb', 'cc', 'dd', 'ee', 'ff'], - 'd': [1, None, 3, None, 5, 4], - 'e': [1.1, np.NAN, 3.3, None, 5.5, 4.4], - 'f': ['aa', np.NAN, 'cc', None, '', 'ff'], +class LabelEncode(MLProcess): + def __init__(self, features: list,): + self.features = features + self.le_encoders = [] - } + def fit(self, df: pd.DataFrame): + for col in self.features: + le = LabelEncoder().fit(df[col].astype(str).unique().tolist() + ['unknown']) + self.le_encoders.append(le) - df = pd.DataFrame(V) - print(df.dtypes) + def transform(self, df: pd.DataFrame): + for i in range(len(self.features)): + data_list = df[self.features[i]].astype(str).tolist() + for unique_item in np.unique(df[self.features[i]].astype(str)): + if unique_item not in self.le_encoders[i].classes_: + data_list = ['unknown' if x == unique_item else x for x in data_list] + df[self.features[i]] = self.le_encoders[i].transform(data_list) + return df - numeric_features = ['a', 'b', 'd', 'e'] - numeric_features_wo_miss = ['a', 'b', ] - categorial_features = ['c', 'f'] - df_ = fill_missing_value(df.copy(), numeric_features) - print(df_) - df_ = fill_missing_value(df.copy(), categorial_features, strategy='constant', fill_value='hehe') - print(df_) +def get_column_info(df: pd.DataFrame) -> str: + data = [] + for i in df.columns: + nan_freq = float("%.2g" % (df[i].isna().mean() * 100)) + n_unique = df[i].nunique() + data.append([i, df[i].dtype, nan_freq, n_unique]) - df_ = fill_missing_value(df.copy(), numeric_features, strategy='constant', fill_value=999) - print(df_) - - # df_ = label_encode(df.copy(), numeric_features + categorial_features, ) - # print(df_) - - df_ = split_bins(df.copy(), numeric_features_wo_miss, strategy='quantile') - print(df_) - - df_ = min_max_scale(df.copy(), numeric_features, ) - print(df_) - - df_ = standard_scale(df.copy(), numeric_features, ) - print(df_) - - df_ = log_transform(df.copy(), numeric_features, ) - print(df_) - - df_ = max_abs_scale(df.copy(), numeric_features, ) - print(df_) - - df_ = robust_scale(df.copy(), numeric_features, ) - print(df_) - run() \ No newline at end of file + samples = pd.DataFrame( + data, + columns=["Column_name", "Data_type", "NaN_Frequency(%)", "N_unique"], + ) + return samples.to_string(index=False) +# +# +# if __name__ == '__main__': +# def run(): +# V = { +# 'a': [-1, 2, 3, 6, 5, 4], +# 'b': [1.1, 2.2, 3.3, 6.6, 5.5, 4.4], +# 'c': ['aa', 'bb', 'cc', 'dd', 'ee', 'ff'], +# 'd': [1, None, 3, None, 5, 4], +# 'e': [1.1, np.NAN, 3.3, None, 5.5, 4.4], +# 'f': ['aa', np.NAN, 'cc', None, '', 'ff'], +# +# } +# +# df = pd.DataFrame(V) +# print(df.dtypes) +# +# numeric_features = ['a', 'b', 'd', 'e'] +# numeric_features_wo_miss = ['a', 'b', ] +# categorial_features = ['c', 'f'] +# +# df_ = fill_missing_value(df.copy(), numeric_features) +# print(df_) +# df_ = fill_missing_value(df.copy(), categorial_features, strategy='constant', fill_value='hehe') +# print(df_) +# +# df_ = fill_missing_value(df.copy(), numeric_features, strategy='constant', fill_value=999) +# print(df_) +# +# # df_ = label_encode(df.copy(), numeric_features + categorial_features, ) +# # print(df_) +# +# df_ = split_bins(df.copy(), numeric_features_wo_miss, strategy='quantile') +# print(df_) +# +# df_ = min_max_scale(df.copy(), numeric_features, ) +# print(df_) +# +# df_ = standard_scale(df.copy(), numeric_features, ) +# print(df_) +# +# df_ = log_transform(df.copy(), numeric_features, ) +# print(df_) +# +# df_ = max_abs_scale(df.copy(), numeric_features, ) +# print(df_) +# +# df_ = robust_scale(df.copy(), numeric_features, ) +# print(df_) +# run() \ No newline at end of file diff --git a/metagpt/tools/functions/libs/feature_engineering.py b/metagpt/tools/functions/libs/feature_engineering.py index 4780e4fa0..06a988d9a 100644 --- a/metagpt/tools/functions/libs/feature_engineering.py +++ b/metagpt/tools/functions/libs/feature_engineering.py @@ -3,188 +3,285 @@ # @Time : 2023/11/17 10:33 # @Author : lidanyang # @File : feature_engineering.py -# @Desc : Feature Engineering Functions +# @Desc : Feature Engineering Tools import itertools +import numpy as np from dateutil.relativedelta import relativedelta +from joblib import Parallel, delayed from pandas.api.types import is_numeric_dtype from sklearn.model_selection import KFold -from sklearn.preprocessing import PolynomialFeatures +from sklearn.preprocessing import PolynomialFeatures, KBinsDiscretizer -from metagpt.tools.functions import registry +from metagpt.tools.functions.libs.base import MLProcess from metagpt.tools.functions.schemas.feature_engineering import * -@registry.register("feature_engineering", PolynomialExpansion) -def polynomial_expansion(df, cols, degree=2): - for col in cols: - if not is_numeric_dtype(df[col]): - raise ValueError(f"Column '{col}' must be numeric.") +class PolynomialExpansion(MLProcess): + def __init__(self, cols: list, degree: int = 2): + self.cols = cols + self.degree = degree + self.poly = PolynomialFeatures(degree=degree, include_bias=False) - poly = PolynomialFeatures(degree=degree, include_bias=False) - ts_data = poly.fit_transform(df[cols].fillna(0)) - new_columns = poly.get_feature_names_out(cols) - ts_data = pd.DataFrame(ts_data, columns=new_columns, index=df.index) - ts_data = ts_data.drop(cols, axis=1) - df = pd.concat([df, ts_data], axis=1) - return df + def fit(self, df: pd.DataFrame): + self.poly.fit(df[self.cols].fillna(0)) + + def transform(self, df: pd.DataFrame) -> pd.DataFrame: + ts_data = self.poly.transform(df[self.cols].fillna(0)) + column_name = self.poly.get_feature_names_out(self.cols) + ts_data = pd.DataFrame(ts_data, index=df.index, columns=column_name) + df.drop(self.cols, axis=1, inplace=True) + df = pd.concat([df, ts_data], axis=1) + return df -@registry.register("feature_engineering", FrequencyEncoding) -def frequency_encoding(df, cols): - for col in cols: - encoder_dict = df[col].value_counts().to_dict() - df[f"{col}_cnt"] = df[col].map(encoder_dict) - return df +class CatCount(MLProcess): + def __init__(self, col: str): + self.col = col + self.encoder_dict = None + + def fit(self, df: pd.DataFrame): + self.encoder_dict = df[self.col].value_counts().to_dict() + + def transform(self, df: pd.DataFrame) -> pd.DataFrame: + df[f"{self.col}_cnt"] = df[self.col].map(self.encoder_dict) + return df -@registry.register("feature_engineering", TargetMeanEncoder) -def target_mean_encoder(df, col, label): - encoder_dict = df.groupby(col)[label].mean().to_dict() - df[f"{col}_target_mean"] = df[col].map(encoder_dict) - return df +class TargetMeanEncoder(MLProcess): + def __init__(self, col: str, label: str): + self.col = col + self.label = label + self.encoder_dict = None + + def fit(self, df: pd.DataFrame): + self.encoder_dict = df.groupby(self.col)[self.label].mean().to_dict() + + def transform(self, df: pd.DataFrame) -> pd.DataFrame: + df[f"{self.col}_target_mean"] = df[self.col].map(self.encoder_dict) + return df -@registry.register("feature_engineering", KFoldTargetMeanEncoder) -def k_fold_target_mean_encoder(df, col, label, n_splits=5, random_state=2021): - tmp = df.copy() - kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state) +class KFoldTargetMeanEncoder(MLProcess): + def __init__(self, col: str, label: str, n_splits: int = 5, random_state: int = 2021): + self.col = col + self.label = label + self.n_splits = n_splits + self.random_state = random_state + self.encoder_dict = None - global_mean = tmp[label].mean() - col_name = f"{col}_kf_target_mean" - for trn_idx, val_idx in kf.split(tmp, tmp[label]): - _trn, _val = tmp.iloc[trn_idx], tmp.iloc[val_idx] - tmp.loc[tmp.index[val_idx], col_name] = _val[col].map( - _trn.groupby(col)[label].mean() - ) - tmp[col_name].fillna(global_mean, inplace=True) - encoder_dict = tmp.groupby(col)[col_name].mean().to_dict() - df[f"{col}_kf_target_mean"] = df[col].map(encoder_dict) - return df + def fit(self, df: pd.DataFrame): + tmp = df.copy() + kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state) - -@registry.register("feature_engineering", CatCross) -def cat_cross(df, cols, max_cat_num=100): - for col in cols: - if df[col].nunique() > max_cat_num: - cols.remove(col) - - for col1, col2 in itertools.combinations(cols, 2): - cross_col = f"{col1}_cross_{col2}" - crossed = df[col1].astype(str) + "_" + df[col2].astype(str) - df[cross_col] = crossed.astype('category').cat.codes - return df - - -@registry.register("feature_engineering", GroupStat) -def group_stat(df, group_col, agg_col, agg_funcs): - group_df = df.groupby(group_col)[agg_col].agg(agg_funcs).reset_index() - group_df.columns = group_col + [ - f"{agg_col}_{agg_func}_by_{group_col}" for agg_func in agg_funcs - ] - df = df.merge(group_df, on=group_col, how="left") - return df - - -@registry.register("feature_engineering", ExtractTimeComps) -def extract_time_comps(df, time_col, time_comps): - time_s = pd.to_datetime(df[time_col], errors="coerce") - time_comps_df = pd.DataFrame() - - if "year" in time_comps: - time_comps_df["year"] = time_s.dt.year - if "month" in time_comps: - time_comps_df["month"] = time_s.dt.month - if "day" in time_comps: - time_comps_df["day"] = time_s.dt.day - if "hour" in time_comps: - time_comps_df["hour"] = time_s.dt.hour - if "dayofweek" in time_comps: - time_comps_df["dayofweek"] = time_s.dt.dayofweek + 1 - if "is_weekend" in time_comps: - time_comps_df["is_weekend"] = time_s.dt.dayofweek.isin([5, 6]).astype(int) - df = pd.concat([df, time_comps_df], axis=1) - return df - - -@registry.register("feature_engineering", FeShiftByTime) -def fe_shift_by_time(df, time_col, group_col, shift_col, periods, freq): - df[time_col] = pd.to_datetime(df[time_col]) - - def shift_datetime(date, offset, unit): - if unit in ["year", "y", "Y"]: - return date + relativedelta(years=offset) - elif unit in ["month", "m", "M"]: - return date + relativedelta(months=offset) - elif unit in ["day", "d", "D"]: - return date + relativedelta(days=offset) - elif unit in ["week", "w", "W"]: - return date + relativedelta(weeks=offset) - elif unit in ["hour", "h", "H"]: - return date + relativedelta(hours=offset) - else: - return date - - def shift_by_time_on_key( - inner_df, time_col, group_col, shift_col, offset, unit, col_name - ): - inner_df = inner_df.drop_duplicates() - inner_df[time_col] = inner_df[time_col].map( - lambda x: shift_datetime(x, offset, unit) - ) - inner_df = inner_df.groupby([time_col, group_col], as_index=False)[ - shift_col - ].mean() - inner_df.rename(columns={shift_col: col_name}, inplace=True) - return inner_df - - shift_df = df[[time_col, group_col, shift_col]].copy() - for period in periods: - new_col_name = f"{group_col}_{shift_col}_lag_{period}_{freq}" - tmp = shift_by_time_on_key( - shift_df, time_col, group_col, shift_col, period, freq, new_col_name - ) - df = df.merge(tmp, on=[time_col, group_col], how="left") - - return df - - -@registry.register("feature_engineering", FeRollingByTime) -def fe_rolling_by_time(df, time_col, group_col, rolling_col, periods, freq, agg_funcs): - df[time_col] = pd.to_datetime(df[time_col]) - - def rolling_by_time_on_key(inner_df, offset, unit, agg_func, col_name): - time_freq = { - "Y": [365 * offset, "D"], - "M": [30 * offset, "D"], - "D": [offset, "D"], - "W": [7 * offset, "D"], - "H": [offset, "h"], - } - - if agg_func not in ["mean", "std", "max", "min", "median", "sum", "count"]: - raise ValueError(f"Invalid agg function: {agg_func}") - - rolling_feat = inner_df.rolling( - f"{time_freq[unit][0]}{time_freq[unit][1]}", closed="left" - ) - rolling_feat = getattr(rolling_feat, agg_func)() - depth = df.columns.nlevels - rolling_feat = rolling_feat.stack(list(range(depth))) - rolling_feat.name = col_name - return rolling_feat - - rolling_df = df[[time_col, group_col, rolling_col]].copy() - for period in periods: - for func in agg_funcs: - new_col_name = f"{group_col}_{rolling_col}_rolling_{period}_{freq}_{func}" - tmp = pd.pivot_table( - rolling_df, - index=time_col, - values=rolling_col, - columns=group_col, + global_mean = tmp[self.label].mean() + col_name = f"{self.col}_kf_target_mean" + for trn_idx, val_idx in kf.split(tmp, tmp[self.label]): + _trn, _val = tmp.iloc[trn_idx], tmp.iloc[val_idx] + tmp.loc[tmp.index[val_idx], col_name] = _val[self.col].map( + _trn.groupby(self.col)[self.label].mean() ) - tmp = rolling_by_time_on_key(tmp, period, freq, func, new_col_name) - df = df.merge(tmp, on=[time_col, group_col], how="left") + tmp[col_name].fillna(global_mean, inplace=True) + self.encoder_dict = tmp.groupby(self.col)[col_name].mean().to_dict() - return df + def transform(self, df: pd.DataFrame) -> pd.DataFrame: + df[f"{self.col}_kf_target_mean"] = df[self.col].map(self.encoder_dict) + return df + + +class CatCross(MLProcess): + def __init__(self, cols: list, max_cat_num: int = 100): + self.cols = cols + self.max_cat_num = max_cat_num + self.combs = [] + self.combs_map = {} + + @staticmethod + def cross_two(comb, df): + new_col = f'{comb[0]}_{comb[1]}' + new_col_combs = list(itertools.product(df[comb[0]].unique(), df[comb[1]].unique())) + ll = list(range(len(new_col_combs))) + comb_map = dict(zip(new_col_combs, ll)) + return new_col, comb_map + + def fit(self, df: pd.DataFrame): + for col in self.cols: + if df[col].nunique() > self.max_cat_num: + self.cols.remove(col) + self.combs = list(itertools.combinations(self.cols, 2)) + res = Parallel(n_jobs=4, require='sharedmem')( + delayed(self.cross_two)(comb, df) for comb in self.combs) + self.combs_map = dict(res) + + def transform(self, df: pd.DataFrame) -> pd.DataFrame: + for comb in self.combs: + new_col = f'{comb[0]}_{comb[1]}' + _map = self.combs_map[new_col] + df[new_col] = pd.Series(zip(df[comb[0]], df[comb[1]])).map(_map) + # set the unknown value to a new number + df[new_col].fillna(max(_map.values()) + 1, inplace=True) + df[new_col] = df[new_col].astype(int) + return df + + +class GroupStat(MLProcess): + def __init__(self, group_col: str, agg_col: str, agg_funcs: list): + self.group_col = group_col + self.agg_col = agg_col + self.agg_funcs = agg_funcs + self.group_df = None + + def fit(self, df: pd.DataFrame): + group_df = df.groupby(self.group_col)[self.agg_col].agg(self.agg_funcs).reset_index() + group_df.columns = [self.group_col] + [ + f"{self.agg_col}_{agg_func}_by_{self.group_col}" for agg_func in self.agg_funcs + ] + self.group_df = group_df + + def transform(self, df: pd.DataFrame) -> pd.DataFrame: + df = df.merge(self.group_df, on=self.group_col, how="left") + return df + + +class SplitBins(MLProcess): + def __init__(self, cols: str, strategy: str = 'quantile'): + self.cols = cols + self.strategy = strategy + self.encoder = None + + def fit(self, df: pd.DataFrame): + self.encoder = KBinsDiscretizer(strategy=self.strategy, encode='ordinal') + self.encoder.fit(df[self.cols].fillna(0)) + + def transform(self, df: pd.DataFrame) -> pd.DataFrame: + df[self.cols] = self.encoder.transform(df[self.cols].fillna(0)) + return df + +# @registry.register("feature_engineering", ExtractTimeComps) +# def extract_time_comps(df, time_col, time_comps): +# time_s = pd.to_datetime(df[time_col], errors="coerce") +# time_comps_df = pd.DataFrame() +# +# if "year" in time_comps: +# time_comps_df["year"] = time_s.dt.year +# if "month" in time_comps: +# time_comps_df["month"] = time_s.dt.month +# if "day" in time_comps: +# time_comps_df["day"] = time_s.dt.day +# if "hour" in time_comps: +# time_comps_df["hour"] = time_s.dt.hour +# if "dayofweek" in time_comps: +# time_comps_df["dayofweek"] = time_s.dt.dayofweek + 1 +# if "is_weekend" in time_comps: +# time_comps_df["is_weekend"] = time_s.dt.dayofweek.isin([5, 6]).astype(int) +# df = pd.concat([df, time_comps_df], axis=1) +# return df +# +# +# @registry.register("feature_engineering", FeShiftByTime) +# def fe_shift_by_time(df, time_col, group_col, shift_col, periods, freq): +# df[time_col] = pd.to_datetime(df[time_col]) +# +# def shift_datetime(date, offset, unit): +# if unit in ["year", "y", "Y"]: +# return date + relativedelta(years=offset) +# elif unit in ["month", "m", "M"]: +# return date + relativedelta(months=offset) +# elif unit in ["day", "d", "D"]: +# return date + relativedelta(days=offset) +# elif unit in ["week", "w", "W"]: +# return date + relativedelta(weeks=offset) +# elif unit in ["hour", "h", "H"]: +# return date + relativedelta(hours=offset) +# else: +# return date +# +# def shift_by_time_on_key( +# inner_df, time_col, group_col, shift_col, offset, unit, col_name +# ): +# inner_df = inner_df.drop_duplicates() +# inner_df[time_col] = inner_df[time_col].map( +# lambda x: shift_datetime(x, offset, unit) +# ) +# inner_df = inner_df.groupby([time_col, group_col], as_index=False)[ +# shift_col +# ].mean() +# inner_df.rename(columns={shift_col: col_name}, inplace=True) +# return inner_df +# +# shift_df = df[[time_col, group_col, shift_col]].copy() +# for period in periods: +# new_col_name = f"{group_col}_{shift_col}_lag_{period}_{freq}" +# tmp = shift_by_time_on_key( +# shift_df, time_col, group_col, shift_col, period, freq, new_col_name +# ) +# df = df.merge(tmp, on=[time_col, group_col], how="left") +# +# return df +# +# +# @registry.register("feature_engineering", FeRollingByTime) +# def fe_rolling_by_time(df, time_col, group_col, rolling_col, periods, freq, agg_funcs): +# df[time_col] = pd.to_datetime(df[time_col]) +# +# def rolling_by_time_on_key(inner_df, offset, unit, agg_func, col_name): +# time_freq = { +# "Y": [365 * offset, "D"], +# "M": [30 * offset, "D"], +# "D": [offset, "D"], +# "W": [7 * offset, "D"], +# "H": [offset, "h"], +# } +# +# if agg_func not in ["mean", "std", "max", "min", "median", "sum", "count"]: +# raise ValueError(f"Invalid agg function: {agg_func}") +# +# rolling_feat = inner_df.rolling( +# f"{time_freq[unit][0]}{time_freq[unit][1]}", closed="left" +# ) +# rolling_feat = getattr(rolling_feat, agg_func)() +# depth = df.columns.nlevels +# rolling_feat = rolling_feat.stack(list(range(depth))) +# rolling_feat.name = col_name +# return rolling_feat +# +# rolling_df = df[[time_col, group_col, rolling_col]].copy() +# for period in periods: +# for func in agg_funcs: +# new_col_name = f"{group_col}_{rolling_col}_rolling_{period}_{freq}_{func}" +# tmp = pd.pivot_table( +# rolling_df, +# index=time_col, +# values=rolling_col, +# columns=group_col, +# ) +# tmp = rolling_by_time_on_key(tmp, period, freq, func, new_col_name) +# df = df.merge(tmp, on=[time_col, group_col], how="left") +# +# return df + + +class GeneralSelection(MLProcess): + def __init__(self, label_col: str): + self.label_col = label_col + self.feats = [] + + def fit(self, df: pd.DataFrame): + feats = [f for f in df.columns if f != self.label_col] + for col in df.columns: + if df[col].isnull().sum() / df.shape[0] == 1: + feats.remove(col) + + if df[col].nunique() == 1: + feats.remove(col) + + if ( + df.loc[df[col] == np.inf].shape[0] != 0 + or df.loc[df[col] == np.inf].shape[0] != 0 + ): + feats.remove(col) + self.feats = feats + + def transform(self, df: pd.DataFrame) -> pd.DataFrame: + df = df[self.feats] + return df