Update ML tools from functions to classes

This commit is contained in:
lidanyang 2023-12-12 10:56:05 +08:00
parent fd31cc065a
commit 4f0d55656e
3 changed files with 445 additions and 246 deletions

View file

@ -0,0 +1,16 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2023/12/10 20:12
# @Author : lidanyang
# @File : base
# @Desc :
class MLProcess(object):
    """Base interface for fit/transform style data-processing tools.

    Subclasses override ``fit`` and ``transform``; ``fit_transform`` simply
    chains the two on the same input.
    """

    def fit(self, df):
        """Learn state from ``df``. Must be overridden by subclasses."""
        raise NotImplementedError

    def transform(self, df):
        """Apply the learned state to ``df``. Must be overridden by subclasses."""
        raise NotImplementedError

    def fit_transform(self, df):
        """Run ``fit`` on ``df`` and return what ``transform`` yields for it."""
        self.fit(df)
        transformed = self.transform(df)
        return transformed

View file

@ -1,6 +1,6 @@
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import KBinsDiscretizer, LabelEncoder
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
@ -9,31 +9,52 @@ from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from metagpt.tools.functions import registry
from metagpt.tools.functions.libs.base import MLProcess
from metagpt.tools.functions.schemas.data_preprocess import *
@registry.register("data_preprocess", FillMissingValue)
def fill_missing_value(df: pd.DataFrame, features: list, strategy: str = 'mean', fill_value=None,):
    """Impute the ``features`` columns of ``df`` with a ``SimpleImputer``.

    The selected columns are overwritten on ``df`` itself, which is returned.
    """
    imputer = SimpleImputer(strategy=strategy, fill_value=fill_value)
    imputed = imputer.fit_transform(df[features])
    df[features] = imputed
    return df
class FillMissingValue(MLProcess):
    """Fill missing values in selected columns with a ``SimpleImputer``."""

    def __init__(self, features: list, strategy: str = 'mean', fill_value=None,):
        self.features = features
        self.strategy = strategy
        self.fill_value = fill_value
        # Imputer instance; stays None until fit() creates it.
        self.si = None

    def fit(self, df: pd.DataFrame):
        """Create a new imputer from the stored settings and fit it on ``df[features]``."""
        self.si = SimpleImputer(fill_value=self.fill_value, strategy=self.strategy)
        selected = df[self.features]
        self.si.fit(selected)

    def transform(self, df: pd.DataFrame):
        """Overwrite ``df[features]`` with the imputer's output and return ``df``."""
        imputed = self.si.transform(df[self.features])
        df[self.features] = imputed
        return df
@registry.register("data_preprocess", SplitBins)
def split_bins(df: pd.DataFrame, features: list, strategy: str = 'quantile',):
    """Discretize the ``features`` columns of ``df`` into ordinal bin codes.

    The selected columns are overwritten on ``df`` itself, which is returned.
    """
    discretizer = KBinsDiscretizer(strategy=strategy, encode='ordinal')
    binned = discretizer.fit_transform(df[features])
    df[features] = binned
    return df
class MinMaxScale(MLProcess):
    """Scale selected columns with scikit-learn's ``MinMaxScaler``."""

    def __init__(self, features: list,):
        self.features = features
        # Scaler instance; stays None until fit() creates it.
        self.mms = None

    def fit(self, df: pd.DataFrame):
        """Create a new ``MinMaxScaler`` and fit it on ``df[features]``."""
        self.mms = MinMaxScaler()
        selected = df[self.features]
        self.mms.fit(selected)

    def transform(self, df: pd.DataFrame):
        """Overwrite ``df[features]`` with the scaler's output and return ``df``."""
        scaled = self.mms.transform(df[self.features])
        df[self.features] = scaled
        return df
@registry.register("data_preprocess", MinMaxScale)
def min_max_scale(df: pd.DataFrame, features: list, ):
    """Fit a ``MinMaxScaler`` on ``df[features]`` and overwrite those columns with its output."""
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(df[features])
    df[features] = scaled
    return df
# Tool that applies scikit-learn's StandardScaler to the configured feature columns.
class StandardScale(MLProcess):
def __init__(self, features: list,):
self.features = features
# Scaler instance; stays None until fit() creates it.
self.ss = None
# Create a new StandardScaler and fit it on df[self.features].
def fit(self, df: pd.DataFrame):
self.ss = StandardScaler()
self.ss.fit(df[self.features])
@registry.register("data_preprocess", StandardScale)
def standard_scale(df: pd.DataFrame, features: list, ):
    """Fit a ``StandardScaler`` on ``df[features]`` and overwrite those columns with its output."""
    scaler = StandardScaler()
    scaled = scaler.fit_transform(df[features])
    df[features] = scaled
    return df
# Overwrite df[self.features] with the output of self.ss.transform and return df.
def transform(self, df: pd.DataFrame):
df[self.features] = self.ss.transform(df[self.features])
return df
@registry.register("data_preprocess", LogTransform)
@ -45,80 +66,145 @@ def log_transform(df: pd.DataFrame, features: list, ):
return df
@registry.register("data_preprocess", MaxAbsScale)
def max_abs_scale(df: pd.DataFrame, features: list, ):
    """Fit a ``MaxAbsScaler`` on ``df[features]`` and overwrite those columns with its output."""
    scaler = MaxAbsScaler()
    scaled = scaler.fit_transform(df[features])
    df[features] = scaled
    return df
class MaxAbsScale(MLProcess):
    """Scale selected columns with scikit-learn's ``MaxAbsScaler``."""

    def __init__(self, features: list,):
        self.features = features
        # Scaler instance; stays None until fit() creates it.
        self.mas = None

    def fit(self, df: pd.DataFrame):
        """Create a new ``MaxAbsScaler`` and fit it on ``df[features]``."""
        self.mas = MaxAbsScaler()
        selected = df[self.features]
        self.mas.fit(selected)

    def transform(self, df: pd.DataFrame):
        """Overwrite ``df[features]`` with the scaler's output and return ``df``."""
        scaled = self.mas.transform(df[self.features])
        df[self.features] = scaled
        return df
@registry.register("data_preprocess", RobustScale)
def robust_scale(df: pd.DataFrame, features: list, ):
    """Fit a ``RobustScaler`` on ``df[features]`` and overwrite those columns with its output."""
    scaler = RobustScaler()
    scaled = scaler.fit_transform(df[features])
    df[features] = scaled
    return df
class RobustScale(MLProcess):
    """Scale selected columns with scikit-learn's ``RobustScaler``."""

    def __init__(self, features: list,):
        self.features = features
        # Scaler instance; stays None until fit() creates it.
        self.rs = None

    def fit(self, df: pd.DataFrame):
        """Create a new ``RobustScaler`` and fit it on ``df[features]``."""
        self.rs = RobustScaler()
        selected = df[self.features]
        self.rs.fit(selected)

    def transform(self, df: pd.DataFrame):
        """Overwrite ``df[features]`` with the scaler's output and return ``df``."""
        scaled = self.rs.transform(df[self.features])
        df[self.features] = scaled
        return df
@registry.register("data_preprocess", OrdinalEncode)
def ordinal_encode(df: pd.DataFrame, features: list,):
    """Fit an ``OrdinalEncoder`` on ``df[features]`` and overwrite those columns with its codes."""
    encoder = OrdinalEncoder()
    encoded = encoder.fit_transform(df[features])
    df[features] = encoded
    return df
class OrdinalEncode(MLProcess):
    """Encode selected columns with scikit-learn's ``OrdinalEncoder``."""

    def __init__(self, features: list,):
        self.features = features
        # Encoder instance; stays None until fit() creates it.
        self.oe = None

    def fit(self, df: pd.DataFrame):
        """Create a new ``OrdinalEncoder`` and fit it on ``df[features]``."""
        self.oe = OrdinalEncoder()
        selected = df[self.features]
        self.oe.fit(selected)

    def transform(self, df: pd.DataFrame):
        """Overwrite ``df[features]`` with the encoder's output and return ``df``."""
        encoded = self.oe.transform(df[self.features])
        df[self.features] = encoded
        return df
@registry.register("data_preprocess", OneHotEncoding)
def one_hot_encoding(df, cols):
    """One-hot encode ``cols`` and return a frame with them replaced by indicator columns.

    Args:
        df: Input frame. ``cols`` are dropped from it in place (as before).
        cols: Names of the columns to encode.

    Returns:
        A new frame: the remaining columns of ``df`` followed by the encoded columns,
        named via ``get_feature_names_out``.
    """
    try:
        enc = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        # Older scikit-learn releases only know the `sparse` keyword.
        enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
    ts_data = enc.fit_transform(df[cols])
    new_columns = enc.get_feature_names_out(cols)
    # Reuse df's index so the concat below lines rows up correctly.
    ts_data = pd.DataFrame(ts_data, columns=new_columns, index=df.index)
    df.drop(cols, axis=1, inplace=True)
    df = pd.concat([df, ts_data], axis=1)
    return df
class OneHotEncode(MLProcess):
    """One-hot encode selected columns, replacing them with indicator columns."""

    def __init__(self, features: list,):
        self.features = features
        # Encoder instance; stays None until fit() creates it.
        self.ohe = None

    def fit(self, df: pd.DataFrame):
        """Create a dense-output ``OneHotEncoder`` and fit it on ``df[features]``."""
        try:
            self.ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
        except TypeError:
            # Older scikit-learn releases only know the `sparse` keyword.
            self.ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)
        self.ohe.fit(df[self.features])

    def transform(self, df: pd.DataFrame):
        """Return a new frame with ``features`` replaced by their encoded columns.

        The ``features`` columns are dropped from the passed ``df`` in place (as before).
        """
        ts_data = self.ohe.transform(df[self.features])
        new_columns = self.ohe.get_feature_names_out(self.features)
        # Reuse df's index so the concat below lines rows up correctly.
        ts_data = pd.DataFrame(ts_data, columns=new_columns, index=df.index)
        df.drop(self.features, axis=1, inplace=True)
        df = pd.concat([df, ts_data], axis=1)
        return df
# Script entry point: defines run(), a manual demo that starts from a small sample dict.
if __name__ == '__main__':
def run():
# Sample columns: ints ('a'), floats ('b'), strings ('c'), and variants of each
# that contain None / NaN / empty-string entries ('d', 'e', 'f').
V = {
'a': [-1, 2, 3, 6, 5, 4],
'b': [1.1, 2.2, 3.3, 6.6, 5.5, 4.4],
'c': ['aa', 'bb', 'cc', 'dd', 'ee', 'ff'],
'd': [1, None, 3, None, 5, 4],
'e': [1.1, np.NAN, 3.3, None, 5.5, 4.4],
'f': ['aa', np.NAN, 'cc', None, '', 'ff'],
# Tool that keeps one label encoder per configured feature column.
class LabelEncode(MLProcess):
def __init__(self, features: list,):
self.features = features
# Encoders are stored in the same order as self.features.
self.le_encoders = []
}
def fit(self, df: pd.DataFrame):
    """Fit one ``LabelEncoder`` per feature on its string values plus ``'unknown'``.

    Args:
        df: Frame containing every column named in ``self.features``.
    """
    # Rebuild the list rather than appending: appending on a second fit() left the
    # first fit's encoders at the indices that are paired with self.features.
    self.le_encoders = [
        LabelEncoder().fit(df[col].astype(str).unique().tolist() + ['unknown'])
        for col in self.features
    ]
df = pd.DataFrame(V)
print(df.dtypes)
def transform(self, df: pd.DataFrame):
    """Replace each feature column with its encoder's codes and return ``df``.

    Values are compared as strings; any value missing from an encoder's
    ``classes_`` is encoded as ``'unknown'``.

    Args:
        df: Frame containing every column named in ``self.features``; modified in place.

    Returns:
        The same ``df`` with the feature columns overwritten.
    """
    for col, encoder in zip(self.features, self.le_encoders):
        # One set lookup per value instead of rebuilding the list per unseen value.
        known = set(encoder.classes_)
        values = df[col].astype(str).tolist()
        df[col] = encoder.transform([v if v in known else 'unknown' for v in values])
    return df
# Column groups used by the demo calls below.
numeric_features = ['a', 'b', 'd', 'e']
numeric_features_wo_miss = ['a', 'b', ]
categorial_features = ['c', 'f']
# Default strategy on the numeric columns, then a constant string fill on the categorical ones.
df_ = fill_missing_value(df.copy(), numeric_features)
print(df_)
df_ = fill_missing_value(df.copy(), categorial_features, strategy='constant', fill_value='hehe')
print(df_)
# Collect one row per column of df: name, dtype, NaN percentage and number of unique values.
def get_column_info(df: pd.DataFrame) -> str:
data = []
for i in df.columns:
# Percentage of missing values, formatted to two significant digits and parsed back to float.
nan_freq = float("%.2g" % (df[i].isna().mean() * 100))
n_unique = df[i].nunique()
data.append([i, df[i].dtype, nan_freq, n_unique])
# Remaining demo calls: each works on a fresh copy of df and prints the result.
df_ = fill_missing_value(df.copy(), numeric_features, strategy='constant', fill_value=999)
print(df_)
# df_ = label_encode(df.copy(), numeric_features + categorial_features, )
# print(df_)
df_ = split_bins(df.copy(), numeric_features_wo_miss, strategy='quantile')
print(df_)
df_ = min_max_scale(df.copy(), numeric_features, )
print(df_)
df_ = standard_scale(df.copy(), numeric_features, )
print(df_)
df_ = log_transform(df.copy(), numeric_features, )
print(df_)
df_ = max_abs_scale(df.copy(), numeric_features, )
print(df_)
df_ = robust_scale(df.copy(), numeric_features, )
print(df_)
# Execute the demo.
run()
# Turn the collected rows into a frame and render it as text without the index column.
samples = pd.DataFrame(
data,
columns=["Column_name", "Data_type", "NaN_Frequency(%)", "N_unique"],
)
return samples.to_string(index=False)
#
#
# if __name__ == '__main__':
# def run():
# V = {
# 'a': [-1, 2, 3, 6, 5, 4],
# 'b': [1.1, 2.2, 3.3, 6.6, 5.5, 4.4],
# 'c': ['aa', 'bb', 'cc', 'dd', 'ee', 'ff'],
# 'd': [1, None, 3, None, 5, 4],
# 'e': [1.1, np.NAN, 3.3, None, 5.5, 4.4],
# 'f': ['aa', np.NAN, 'cc', None, '', 'ff'],
#
# }
#
# df = pd.DataFrame(V)
# print(df.dtypes)
#
# numeric_features = ['a', 'b', 'd', 'e']
# numeric_features_wo_miss = ['a', 'b', ]
# categorial_features = ['c', 'f']
#
# df_ = fill_missing_value(df.copy(), numeric_features)
# print(df_)
# df_ = fill_missing_value(df.copy(), categorial_features, strategy='constant', fill_value='hehe')
# print(df_)
#
# df_ = fill_missing_value(df.copy(), numeric_features, strategy='constant', fill_value=999)
# print(df_)
#
# # df_ = label_encode(df.copy(), numeric_features + categorial_features, )
# # print(df_)
#
# df_ = split_bins(df.copy(), numeric_features_wo_miss, strategy='quantile')
# print(df_)
#
# df_ = min_max_scale(df.copy(), numeric_features, )
# print(df_)
#
# df_ = standard_scale(df.copy(), numeric_features, )
# print(df_)
#
# df_ = log_transform(df.copy(), numeric_features, )
# print(df_)
#
# df_ = max_abs_scale(df.copy(), numeric_features, )
# print(df_)
#
# df_ = robust_scale(df.copy(), numeric_features, )
# print(df_)
# run()

View file

@ -3,188 +3,285 @@
# @Time : 2023/11/17 10:33
# @Author : lidanyang
# @File : feature_engineering.py
# @Desc : Feature Engineering Functions
# @Desc : Feature Engineering Tools
import itertools
import numpy as np
from dateutil.relativedelta import relativedelta
from joblib import Parallel, delayed
from pandas.api.types import is_numeric_dtype
from sklearn.model_selection import KFold
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import PolynomialFeatures, KBinsDiscretizer
from metagpt.tools.functions import registry
from metagpt.tools.functions.libs.base import MLProcess
from metagpt.tools.functions.schemas.feature_engineering import *
@registry.register("feature_engineering", PolynomialExpansion)
# Polynomial feature expansion; starts by rejecting any non-numeric input column.
def polynomial_expansion(df, cols, degree=2):
for col in cols:
if not is_numeric_dtype(df[col]):
raise ValueError(f"Column '{col}' must be numeric.")
# Tool wrapping PolynomialFeatures for the configured columns.
class PolynomialExpansion(MLProcess):
def __init__(self, cols: list, degree: int = 2):
self.cols = cols
self.degree = degree
# The transformer is created once here (bias column disabled), not in fit().
self.poly = PolynomialFeatures(degree=degree, include_bias=False)
# Fit and apply the expansion with missing values treated as 0.
poly = PolynomialFeatures(degree=degree, include_bias=False)
ts_data = poly.fit_transform(df[cols].fillna(0))
new_columns = poly.get_feature_names_out(cols)
ts_data = pd.DataFrame(ts_data, columns=new_columns, index=df.index)
# Drop the columns named like the inputs from the generated frame; df keeps its own copies.
ts_data = ts_data.drop(cols, axis=1)
df = pd.concat([df, ts_data], axis=1)
return df
# Fit the polynomial transformer on the selected columns, with missing values replaced by 0.
def fit(self, df: pd.DataFrame):
self.poly.fit(df[self.cols].fillna(0))
# Expand the selected columns (missing values replaced by 0) and return a new frame
# made of df without self.cols plus the generated columns.
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
ts_data = self.poly.transform(df[self.cols].fillna(0))
column_name = self.poly.get_feature_names_out(self.cols)
ts_data = pd.DataFrame(ts_data, index=df.index, columns=column_name)
# NOTE(review): inplace=True removes self.cols from the caller's frame as a side effect;
# confirm callers expect that.
df.drop(self.cols, axis=1, inplace=True)
df = pd.concat([df, ts_data], axis=1)
return df
@registry.register("feature_engineering", FrequencyEncoding)
def frequency_encoding(df, cols):
    """Add a ``<col>_cnt`` column per entry of ``cols`` holding each value's occurrence count.

    ``df`` is modified in place and returned.
    """
    for name in cols:
        counts = df[name].value_counts().to_dict()
        df[f"{name}_cnt"] = df[name].map(counts)
    return df
class CatCount(MLProcess):
    """Add a ``<col>_cnt`` column holding how often each value occurred in the fitted data."""

    def __init__(self, col: str):
        self.col = col
        # value -> occurrence count; filled by fit().
        self.encoder_dict = None

    def fit(self, df: pd.DataFrame):
        """Record the value counts of ``df[col]``."""
        counts = df[self.col].value_counts()
        self.encoder_dict = counts.to_dict()

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Map ``df[col]`` through the stored counts into ``<col>_cnt`` and return ``df``."""
        target = f"{self.col}_cnt"
        df[target] = df[self.col].map(self.encoder_dict)
        return df
@registry.register("feature_engineering", TargetMeanEncoder)
def target_mean_encoder(df, col, label):
    """Add ``<col>_target_mean``: the mean of ``label`` over rows sharing the same ``col`` value.

    ``df`` is modified in place and returned.
    """
    means = df.groupby(col)[label].mean()
    df[f"{col}_target_mean"] = df[col].map(means.to_dict())
    return df
class TargetMeanEncoder(MLProcess):
    """Encode a column by the mean of the label observed for each of its values."""

    def __init__(self, col: str, label: str):
        self.col = col
        self.label = label
        # value -> mean label; filled by fit().
        self.encoder_dict = None

    def fit(self, df: pd.DataFrame):
        """Store the per-value mean of ``label`` grouped by ``col``."""
        means = df.groupby(self.col)[self.label].mean()
        self.encoder_dict = means.to_dict()

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Map ``df[col]`` through the stored means into ``<col>_target_mean`` and return ``df``."""
        target = f"{self.col}_target_mean"
        df[target] = df[self.col].map(self.encoder_dict)
        return df
@registry.register("feature_engineering", KFoldTargetMeanEncoder)
# K-fold target-mean encoding of `col`; works on a copy of df with a shuffled, seeded KFold.
def k_fold_target_mean_encoder(df, col, label, n_splits=5, random_state=2021):
tmp = df.copy()
kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
# Tool that stores the settings for a K-fold target-mean encoding of one column.
class KFoldTargetMeanEncoder(MLProcess):
def __init__(self, col: str, label: str, n_splits: int = 5, random_state: int = 2021):
self.col = col
self.label = label
self.n_splits = n_splits
self.random_state = random_state
# value -> encoded mean; filled by fit().
self.encoder_dict = None
# Overall label mean, used below for rows whose category was absent from the training fold.
global_mean = tmp[label].mean()
col_name = f"{col}_kf_target_mean"
for trn_idx, val_idx in kf.split(tmp, tmp[label]):
_trn, _val = tmp.iloc[trn_idx], tmp.iloc[val_idx]
# Encode the validation rows with category means computed on the training rows only.
tmp.loc[tmp.index[val_idx], col_name] = _val[col].map(
_trn.groupby(col)[label].mean()
)
# NOTE(review): in-place fillna on a column selection — confirm it still updates `tmp`
# under the pandas version in use.
tmp[col_name].fillna(global_mean, inplace=True)
# Average the out-of-fold values per category and map them back onto the input frame.
encoder_dict = tmp.groupby(col)[col_name].mean().to_dict()
df[f"{col}_kf_target_mean"] = df[col].map(encoder_dict)
return df
# Start of fit(): work on a copy of df and set up a shuffled, seeded KFold splitter.
def fit(self, df: pd.DataFrame):
tmp = df.copy()
kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state)
@registry.register("feature_engineering", CatCross)
def cat_cross(df, cols, max_cat_num=100):
    """Add an integer-coded cross feature for every pair of low-cardinality columns.

    Args:
        df: Frame to extend; new columns are added to it in place.
        cols: Candidate column names. The list is not modified.
        max_cat_num: Columns with more unique values than this are left out.

    Returns:
        ``df`` with one ``<col1>_cross_<col2>`` column per pair of kept columns.
    """
    # Filter into a new list: removing from `cols` while iterating it skipped the
    # element after each removal and also changed the caller's list.
    kept = [col for col in cols if df[col].nunique() <= max_cat_num]
    for col1, col2 in itertools.combinations(kept, 2):
        cross_col = f"{col1}_cross_{col2}"
        crossed = df[col1].astype(str) + "_" + df[col2].astype(str)
        df[cross_col] = crossed.astype('category').cat.codes
    return df
@registry.register("feature_engineering", GroupStat)
def group_stat(df, group_col, agg_col, agg_funcs):
    """Attach group-level aggregates of ``agg_col`` to every row of ``df``.

    Args:
        df: Input frame.
        group_col: Grouping column name, or a list of names.
        agg_col: Column to aggregate.
        agg_funcs: Aggregation function names passed to ``agg``.

    Returns:
        ``df`` left-merged with one ``<agg_col>_<func>_by_<group_col>`` column per function.
    """
    # A plain string used to fail on `group_col + [...]`; normalise to a list of keys.
    keys = [group_col] if isinstance(group_col, str) else list(group_col)
    group_df = df.groupby(group_col)[agg_col].agg(agg_funcs).reset_index()
    group_df.columns = keys + [
        f"{agg_col}_{agg_func}_by_{group_col}" for agg_func in agg_funcs
    ]
    df = df.merge(group_df, on=group_col, how="left")
    return df
@registry.register("feature_engineering", ExtractTimeComps)
def extract_time_comps(df, time_col, time_comps):
    """Append the requested datetime components of ``time_col`` as new columns.

    ``time_col`` is parsed with ``errors="coerce"``. Supported names are ``year``,
    ``month``, ``day``, ``hour``, ``dayofweek`` (shifted by +1) and ``is_weekend``;
    output columns always follow that order.
    """
    time_s = pd.to_datetime(df[time_col], errors="coerce")
    # Ordered table of component name -> extractor.
    extractors = (
        ("year", lambda s: s.dt.year),
        ("month", lambda s: s.dt.month),
        ("day", lambda s: s.dt.day),
        ("hour", lambda s: s.dt.hour),
        ("dayofweek", lambda s: s.dt.dayofweek + 1),
        ("is_weekend", lambda s: s.dt.dayofweek.isin([5, 6]).astype(int)),
    )
    time_comps_df = pd.DataFrame()
    for comp, extract in extractors:
        if comp in time_comps:
            time_comps_df[comp] = extract(time_s)
    return pd.concat([df, time_comps_df], axis=1)
@registry.register("feature_engineering", FeShiftByTime)
# Add lag features: for each period, the mean of shift_col per (time, group) key is moved
# forward in time by `period` units of `freq` and merged back onto df.
def fe_shift_by_time(df, time_col, group_col, shift_col, periods, freq):
df[time_col] = pd.to_datetime(df[time_col])
# Return `date` moved by `offset` units; unrecognised units leave the date unchanged.
def shift_datetime(date, offset, unit):
if unit in ["year", "y", "Y"]:
return date + relativedelta(years=offset)
elif unit in ["month", "m", "M"]:
return date + relativedelta(months=offset)
elif unit in ["day", "d", "D"]:
return date + relativedelta(days=offset)
elif unit in ["week", "w", "W"]:
return date + relativedelta(weeks=offset)
elif unit in ["hour", "h", "H"]:
return date + relativedelta(hours=offset)
else:
return date
# Build the lag table for one offset: de-duplicate, shift the timestamps, average
# shift_col per (time, group) and rename the result to col_name.
def shift_by_time_on_key(
inner_df, time_col, group_col, shift_col, offset, unit, col_name
):
inner_df = inner_df.drop_duplicates()
inner_df[time_col] = inner_df[time_col].map(
lambda x: shift_datetime(x, offset, unit)
)
inner_df = inner_df.groupby([time_col, group_col], as_index=False)[
shift_col
].mean()
inner_df.rename(columns={shift_col: col_name}, inplace=True)
return inner_df
# Only the three relevant columns are copied and reused for every period.
shift_df = df[[time_col, group_col, shift_col]].copy()
for period in periods:
new_col_name = f"{group_col}_{shift_col}_lag_{period}_{freq}"
tmp = shift_by_time_on_key(
shift_df, time_col, group_col, shift_col, period, freq, new_col_name
)
df = df.merge(tmp, on=[time_col, group_col], how="left")
return df
@registry.register("feature_engineering", FeRollingByTime)
# Add time-window rolling aggregates of rolling_col per group, one column per
# (period, aggregation function) combination.
def fe_rolling_by_time(df, time_col, group_col, rolling_col, periods, freq, agg_funcs):
df[time_col] = pd.to_datetime(df[time_col])
# Compute one rolling aggregate over a time-indexed pivot table and return it as a named Series.
def rolling_by_time_on_key(inner_df, offset, unit, agg_func, col_name):
# Each unit maps to [window length, pandas offset alias]; years and months are
# expressed as 365 and 30 days.
time_freq = {
"Y": [365 * offset, "D"],
"M": [30 * offset, "D"],
"D": [offset, "D"],
"W": [7 * offset, "D"],
"H": [offset, "h"],
}
if agg_func not in ["mean", "std", "max", "min", "median", "sum", "count"]:
raise ValueError(f"Invalid agg function: {agg_func}")
# closed="left" is passed to rolling() for the window bounds.
rolling_feat = inner_df.rolling(
f"{time_freq[unit][0]}{time_freq[unit][1]}", closed="left"
)
rolling_feat = getattr(rolling_feat, agg_func)()
# Stack the column levels back into the index before naming the Series.
depth = df.columns.nlevels
rolling_feat = rolling_feat.stack(list(range(depth)))
rolling_feat.name = col_name
return rolling_feat
rolling_df = df[[time_col, group_col, rolling_col]].copy()
for period in periods:
for func in agg_funcs:
new_col_name = f"{group_col}_{rolling_col}_rolling_{period}_{freq}_{func}"
# Pivot to time rows x group columns for the rolling helper.
tmp = pd.pivot_table(
rolling_df,
index=time_col,
values=rolling_col,
columns=group_col,
# Overall label mean, kept for filling rows that receive no fold-based value.
global_mean = tmp[self.label].mean()
col_name = f"{self.col}_kf_target_mean"
for trn_idx, val_idx in kf.split(tmp, tmp[self.label]):
_trn, _val = tmp.iloc[trn_idx], tmp.iloc[val_idx]
# Encode the validation rows with category means computed on the training rows only.
tmp.loc[tmp.index[val_idx], col_name] = _val[self.col].map(
_trn.groupby(self.col)[self.label].mean()
)
tmp = rolling_by_time_on_key(tmp, period, freq, func, new_col_name)
df = df.merge(tmp, on=[time_col, group_col], how="left")
# NOTE(review): in-place fillna on a column selection — confirm it still updates `tmp`
# under the pandas version in use.
tmp[col_name].fillna(global_mean, inplace=True)
# Store the per-category mean of the out-of-fold values as the encoding table.
self.encoder_dict = tmp.groupby(self.col)[col_name].mean().to_dict()
return df
# Map df[self.col] through the stored encoding table into "<col>_kf_target_mean" and return df.
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
df[f"{self.col}_kf_target_mean"] = df[self.col].map(self.encoder_dict)
return df
class CatCross(MLProcess):
    """Create integer-coded cross features for every pair of low-cardinality columns."""

    def __init__(self, cols: list, max_cat_num: int = 100):
        self.cols = cols
        self.max_cat_num = max_cat_num
        # Column pairs selected by fit().
        self.combs = []
        # new column name -> {(value1, value2): integer code}
        self.combs_map = {}

    @staticmethod
    def cross_two(comb, df):
        """Return the new column name and the pair -> code map for one column pair."""
        new_col = f'{comb[0]}_{comb[1]}'
        new_col_combs = itertools.product(df[comb[0]].unique(), df[comb[1]].unique())
        comb_map = {pair: code for code, pair in enumerate(new_col_combs)}
        return new_col, comb_map

    def fit(self, df: pd.DataFrame):
        """Select columns with at most ``max_cat_num`` unique values and build the code maps."""
        # Filter into a new list: removing from self.cols while iterating it skipped the
        # element after each removal and also changed the caller's list.
        self.cols = [col for col in self.cols if df[col].nunique() <= self.max_cat_num]
        self.combs = list(itertools.combinations(self.cols, 2))
        res = Parallel(n_jobs=4, require='sharedmem')(
            delayed(self.cross_two)(comb, df) for comb in self.combs)
        self.combs_map = dict(res)

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add one ``<col1>_<col2>`` integer column per fitted pair and return ``df``.

        Pairs not seen during ``fit`` get one shared code equal to the number of
        fitted pairs (one past the largest fitted code).
        """
        for comb in self.combs:
            new_col = f'{comb[0]}_{comb[1]}'
            _map = self.combs_map[new_col]
            unknown_code = len(_map)
            # A plain list is assigned by position, so frames whose index is not 0..n-1
            # are handled correctly (a default-indexed Series would be aligned by label).
            # NOTE(review): pairs containing NaN may not match a fitted key — confirm
            # the desired handling.
            df[new_col] = [
                _map.get(pair, unknown_code) for pair in zip(df[comb[0]], df[comb[1]])
            ]
        return df
class GroupStat(MLProcess):
    """Attach group-level aggregates of one column, computed on the fitted data."""

    def __init__(self, group_col: str, agg_col: str, agg_funcs: list):
        self.group_col = group_col
        self.agg_col = agg_col
        self.agg_funcs = agg_funcs
        # Lookup table of aggregates per group; filled by fit().
        self.group_df = None

    def fit(self, df: pd.DataFrame):
        """Aggregate ``agg_col`` per ``group_col`` value and store the renamed result."""
        grouped = df.groupby(self.group_col)[self.agg_col]
        stats = grouped.agg(self.agg_funcs).reset_index()
        stat_names = [
            f"{self.agg_col}_{agg_func}_by_{self.group_col}" for agg_func in self.agg_funcs
        ]
        stats.columns = [self.group_col] + stat_names
        self.group_df = stats

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return ``df`` left-merged with the stored aggregates on ``group_col``."""
        return df.merge(self.group_df, on=self.group_col, how="left")
class SplitBins(MLProcess):
    """Discretize selected columns into ordinal bin codes with ``KBinsDiscretizer``."""

    def __init__(self, cols: list, strategy: str = 'quantile'):
        """
        Args:
            cols: Column names to bin. A single name given as a string is also accepted.
            strategy: Binning strategy passed to ``KBinsDiscretizer``.
        """
        # Wrap a lone name so df[self.cols] is always a 2-D selection.
        self.cols = [cols] if isinstance(cols, str) else cols
        self.strategy = strategy
        # Discretizer instance; stays None until fit() creates it.
        self.encoder = None

    def fit(self, df: pd.DataFrame):
        """Fit a new discretizer on ``df[cols]`` with missing values replaced by 0."""
        self.encoder = KBinsDiscretizer(strategy=self.strategy, encode='ordinal')
        self.encoder.fit(df[self.cols].fillna(0))

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Overwrite ``df[cols]`` with bin codes (missing values treated as 0) and return ``df``."""
        df[self.cols] = self.encoder.transform(df[self.cols].fillna(0))
        return df
# @registry.register("feature_engineering", ExtractTimeComps)
# def extract_time_comps(df, time_col, time_comps):
# time_s = pd.to_datetime(df[time_col], errors="coerce")
# time_comps_df = pd.DataFrame()
#
# if "year" in time_comps:
# time_comps_df["year"] = time_s.dt.year
# if "month" in time_comps:
# time_comps_df["month"] = time_s.dt.month
# if "day" in time_comps:
# time_comps_df["day"] = time_s.dt.day
# if "hour" in time_comps:
# time_comps_df["hour"] = time_s.dt.hour
# if "dayofweek" in time_comps:
# time_comps_df["dayofweek"] = time_s.dt.dayofweek + 1
# if "is_weekend" in time_comps:
# time_comps_df["is_weekend"] = time_s.dt.dayofweek.isin([5, 6]).astype(int)
# df = pd.concat([df, time_comps_df], axis=1)
# return df
#
#
# @registry.register("feature_engineering", FeShiftByTime)
# def fe_shift_by_time(df, time_col, group_col, shift_col, periods, freq):
# df[time_col] = pd.to_datetime(df[time_col])
#
# def shift_datetime(date, offset, unit):
# if unit in ["year", "y", "Y"]:
# return date + relativedelta(years=offset)
# elif unit in ["month", "m", "M"]:
# return date + relativedelta(months=offset)
# elif unit in ["day", "d", "D"]:
# return date + relativedelta(days=offset)
# elif unit in ["week", "w", "W"]:
# return date + relativedelta(weeks=offset)
# elif unit in ["hour", "h", "H"]:
# return date + relativedelta(hours=offset)
# else:
# return date
#
# def shift_by_time_on_key(
# inner_df, time_col, group_col, shift_col, offset, unit, col_name
# ):
# inner_df = inner_df.drop_duplicates()
# inner_df[time_col] = inner_df[time_col].map(
# lambda x: shift_datetime(x, offset, unit)
# )
# inner_df = inner_df.groupby([time_col, group_col], as_index=False)[
# shift_col
# ].mean()
# inner_df.rename(columns={shift_col: col_name}, inplace=True)
# return inner_df
#
# shift_df = df[[time_col, group_col, shift_col]].copy()
# for period in periods:
# new_col_name = f"{group_col}_{shift_col}_lag_{period}_{freq}"
# tmp = shift_by_time_on_key(
# shift_df, time_col, group_col, shift_col, period, freq, new_col_name
# )
# df = df.merge(tmp, on=[time_col, group_col], how="left")
#
# return df
#
#
# @registry.register("feature_engineering", FeRollingByTime)
# def fe_rolling_by_time(df, time_col, group_col, rolling_col, periods, freq, agg_funcs):
# df[time_col] = pd.to_datetime(df[time_col])
#
# def rolling_by_time_on_key(inner_df, offset, unit, agg_func, col_name):
# time_freq = {
# "Y": [365 * offset, "D"],
# "M": [30 * offset, "D"],
# "D": [offset, "D"],
# "W": [7 * offset, "D"],
# "H": [offset, "h"],
# }
#
# if agg_func not in ["mean", "std", "max", "min", "median", "sum", "count"]:
# raise ValueError(f"Invalid agg function: {agg_func}")
#
# rolling_feat = inner_df.rolling(
# f"{time_freq[unit][0]}{time_freq[unit][1]}", closed="left"
# )
# rolling_feat = getattr(rolling_feat, agg_func)()
# depth = df.columns.nlevels
# rolling_feat = rolling_feat.stack(list(range(depth)))
# rolling_feat.name = col_name
# return rolling_feat
#
# rolling_df = df[[time_col, group_col, rolling_col]].copy()
# for period in periods:
# for func in agg_funcs:
# new_col_name = f"{group_col}_{rolling_col}_rolling_{period}_{freq}_{func}"
# tmp = pd.pivot_table(
# rolling_df,
# index=time_col,
# values=rolling_col,
# columns=group_col,
# )
# tmp = rolling_by_time_on_key(tmp, period, freq, func, new_col_name)
# df = df.merge(tmp, on=[time_col, group_col], how="left")
#
# return df
class GeneralSelection(MLProcess):
    """Drop feature columns that are entirely null, constant, or contain infinite values."""

    def __init__(self, label_col: str):
        self.label_col = label_col
        # Names of the feature columns kept by fit().
        self.feats = []

    def fit(self, df: pd.DataFrame):
        """Select the feature columns of ``df`` to keep.

        A column other than ``label_col`` is dropped when every value is null, when it
        has exactly one unique value, or when it contains ``+inf`` or ``-inf``.
        """
        n_rows = df.shape[0]
        kept = []
        for col in df.columns:
            # The label is never a feature, so it must not go through the rules below.
            if col == self.label_col:
                continue
            column = df[col]
            # Each rule skips the column once; repeated list.remove() calls used to
            # raise ValueError when a column matched more than one rule.
            if n_rows and column.isnull().all():
                continue
            if column.nunique() == 1:
                continue
            # The original tested +inf twice; check both signs.
            if column.isin([np.inf, -np.inf]).any():
                continue
            kept.append(col)
        self.feats = kept

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return ``df`` restricted to the selected feature columns.

        NOTE(review): ``label_col`` is not part of the result — confirm callers re-attach it.
        """
        df = df[self.feats]
        return df