mirror of
https://github.com/FoundationAgents/MetaGPT.git
synced 2026-05-30 14:35:17 +02:00
update ml tool from Function to Class
This commit is contained in:
parent
fd31cc065a
commit
4f0d55656e
3 changed files with 445 additions and 246 deletions
16
metagpt/tools/functions/libs/base.py
Normal file
16
metagpt/tools/functions/libs/base.py
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2023/12/10 20:12
|
||||
# @Author : lidanyang
|
||||
# @File : base
|
||||
# @Desc :
|
||||
class MLProcess:
    """Base class for fit/transform style data-processing tools.

    Subclasses override ``fit`` and ``transform``; ``fit_transform`` simply
    chains the two on the same input.
    """

    def fit(self, df):
        """Learn whatever state the tool needs from ``df``.

        Raises:
            NotImplementedError: always, unless a subclass overrides it.
        """
        raise NotImplementedError(f"{type(self).__name__} must implement fit()")

    def transform(self, df):
        """Apply the fitted state to ``df`` and return the result.

        Raises:
            NotImplementedError: always, unless a subclass overrides it.
        """
        raise NotImplementedError(f"{type(self).__name__} must implement transform()")

    def fit_transform(self, df):
        """Call ``fit(df)`` and then return ``transform(df)``."""
        self.fit(df)
        return self.transform(df)
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
import numpy as np
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.preprocessing import KBinsDiscretizer
|
||||
from sklearn.preprocessing import KBinsDiscretizer, LabelEncoder
|
||||
from sklearn.preprocessing import MaxAbsScaler
|
||||
from sklearn.preprocessing import MinMaxScaler
|
||||
from sklearn.preprocessing import OneHotEncoder
|
||||
|
|
@ -9,31 +9,52 @@ from sklearn.preprocessing import RobustScaler
|
|||
from sklearn.preprocessing import StandardScaler
|
||||
|
||||
from metagpt.tools.functions import registry
|
||||
from metagpt.tools.functions.libs.base import MLProcess
|
||||
from metagpt.tools.functions.schemas.data_preprocess import *
|
||||
|
||||
|
||||
@registry.register("data_preprocess", FillMissingValue)
def fill_missing_value(
    df: pd.DataFrame,
    features: list,
    strategy: str = 'mean',
    fill_value=None,
):
    """Impute the ``features`` columns with a freshly fitted SimpleImputer.

    The selected columns of ``df`` are overwritten and ``df`` is returned.
    """
    imputer = SimpleImputer(strategy=strategy, fill_value=fill_value)
    imputed = imputer.fit_transform(df[features])
    df[features] = imputed
    return df
|
||||
class FillMissingValue(MLProcess):
    """Impute missing values in selected columns with a SimpleImputer."""

    def __init__(self, features: list, strategy: str = 'mean', fill_value=None,):
        self.features = features
        self.strategy = strategy
        self.fill_value = fill_value
        # The imputer is created by fit().
        self.si = None

    def fit(self, df: pd.DataFrame):
        """Create a SimpleImputer and fit it on the ``features`` columns."""
        imputer = SimpleImputer(strategy=self.strategy, fill_value=self.fill_value)
        self.si = imputer
        imputer.fit(df[self.features])

    def transform(self, df: pd.DataFrame):
        """Overwrite the ``features`` columns with imputed values; return ``df``."""
        imputed = self.si.transform(df[self.features])
        df[self.features] = imputed
        return df
|
||||
|
||||
|
||||
@registry.register("data_preprocess", SplitBins)
def split_bins(
    df: pd.DataFrame,
    features: list,
    strategy: str = 'quantile',
):
    """Replace the ``features`` columns with ordinal bin codes.

    A new KBinsDiscretizer (``encode='ordinal'``) is fitted on every call.
    """
    discretizer = KBinsDiscretizer(strategy=strategy, encode='ordinal')
    binned = discretizer.fit_transform(df[features])
    df[features] = binned
    return df
|
||||
class MinMaxScale(MLProcess):
    """Scale selected columns with a MinMaxScaler."""

    def __init__(self, features: list,):
        self.features = features
        # The scaler is created by fit().
        self.mms = None

    def fit(self, df: pd.DataFrame):
        """Create a MinMaxScaler and fit it on the ``features`` columns."""
        scaler = MinMaxScaler()
        self.mms = scaler
        scaler.fit(df[self.features])

    def transform(self, df: pd.DataFrame):
        """Overwrite the ``features`` columns with scaled values; return ``df``."""
        scaled = self.mms.transform(df[self.features])
        df[self.features] = scaled
        return df
|
||||
|
||||
|
||||
@registry.register("data_preprocess", MinMaxScale)
def min_max_scale(df: pd.DataFrame, features: list, ):
    """Overwrite the ``features`` columns with MinMaxScaler output; return ``df``."""
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(df[features])
    df[features] = scaled
    return df
|
||||
# The legacy function is defined before the class so that the decorator
# argument ``StandardScale`` still resolves to the name that was in scope
# before the class statement rebinds it.
@registry.register("data_preprocess", StandardScale)
def standard_scale(df: pd.DataFrame, features: list, ):
    """Overwrite the ``features`` columns with StandardScaler output; return ``df``."""
    df[features] = StandardScaler().fit_transform(df[features])
    return df


class StandardScale(MLProcess):
    """Standardise selected columns with a StandardScaler."""

    def __init__(self, features: list,):
        self.features = features
        # The scaler is created by fit().
        self.ss = None

    def fit(self, df: pd.DataFrame):
        """Create a StandardScaler and fit it on the ``features`` columns."""
        self.ss = StandardScaler()
        self.ss.fit(df[self.features])

    def transform(self, df: pd.DataFrame):
        """Overwrite the ``features`` columns with scaled values; return ``df``."""
        df[self.features] = self.ss.transform(df[self.features])
        return df
|
||||
|
||||
|
||||
@registry.register("data_preprocess", LogTransform)
|
||||
|
|
@ -45,80 +66,145 @@ def log_transform(df: pd.DataFrame, features: list, ):
|
|||
return df
|
||||
|
||||
|
||||
@registry.register("data_preprocess", MaxAbsScale)
def max_abs_scale(df: pd.DataFrame, features: list, ):
    """Overwrite the ``features`` columns with MaxAbsScaler output; return ``df``."""
    scaler = MaxAbsScaler()
    scaled = scaler.fit_transform(df[features])
    df[features] = scaled
    return df
|
||||
class MaxAbsScale(MLProcess):
    """Scale selected columns with a MaxAbsScaler."""

    def __init__(self, features: list,):
        self.features = features
        # The scaler is created by fit().
        self.mas = None

    def fit(self, df: pd.DataFrame):
        """Create a MaxAbsScaler and fit it on the ``features`` columns."""
        scaler = MaxAbsScaler()
        self.mas = scaler
        scaler.fit(df[self.features])

    def transform(self, df: pd.DataFrame):
        """Overwrite the ``features`` columns with scaled values; return ``df``."""
        scaled = self.mas.transform(df[self.features])
        df[self.features] = scaled
        return df
|
||||
|
||||
|
||||
@registry.register("data_preprocess", RobustScale)
def robust_scale(df: pd.DataFrame, features: list, ):
    """Overwrite the ``features`` columns with RobustScaler output; return ``df``."""
    scaler = RobustScaler()
    scaled = scaler.fit_transform(df[features])
    df[features] = scaled
    return df
|
||||
class RobustScale(MLProcess):
    """Scale selected columns with a RobustScaler."""

    def __init__(self, features: list,):
        self.features = features
        # The scaler is created by fit().
        self.rs = None

    def fit(self, df: pd.DataFrame):
        """Create a RobustScaler and fit it on the ``features`` columns."""
        scaler = RobustScaler()
        self.rs = scaler
        scaler.fit(df[self.features])

    def transform(self, df: pd.DataFrame):
        """Overwrite the ``features`` columns with scaled values; return ``df``."""
        scaled = self.rs.transform(df[self.features])
        df[self.features] = scaled
        return df
|
||||
|
||||
|
||||
@registry.register("data_preprocess", OrdinalEncode)
def ordinal_encode(df: pd.DataFrame, features: list,):
    """Overwrite the ``features`` columns with OrdinalEncoder codes; return ``df``."""
    encoder = OrdinalEncoder()
    encoded = encoder.fit_transform(df[features])
    df[features] = encoded
    return df
|
||||
class OrdinalEncode(MLProcess):
    """Encode selected columns with an OrdinalEncoder."""

    def __init__(self, features: list,):
        self.features = features
        # The encoder is created by fit().
        self.oe = None

    def fit(self, df: pd.DataFrame):
        """Create an OrdinalEncoder and fit it on the ``features`` columns."""
        encoder = OrdinalEncoder()
        self.oe = encoder
        encoder.fit(df[self.features])

    def transform(self, df: pd.DataFrame):
        """Overwrite the ``features`` columns with their codes; return ``df``."""
        encoded = self.oe.transform(df[self.features])
        df[self.features] = encoded
        return df
|
||||
|
||||
|
||||
@registry.register("data_preprocess", OneHotEncoding)
def one_hot_encoding(df, cols):
    """One-hot encode ``cols`` and return a new frame with them replaced.

    ``cols`` are dropped from the passed-in ``df`` in place; the returned
    frame is that ``df`` concatenated with the indicator columns.
    """
    encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)
    encoded = encoder.fit_transform(df[cols])
    encoded_df = pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out(cols),
        index=df.index,
    )
    df.drop(cols, axis=1, inplace=True)
    return pd.concat([df, encoded_df], axis=1)
|
||||
class OneHotEncode(MLProcess):
    """One-hot encode selected columns with a OneHotEncoder."""

    def __init__(self, features: list,):
        self.features = features
        # The encoder is created by fit().
        self.ohe = None

    def fit(self, df: pd.DataFrame):
        """Create a OneHotEncoder (unknowns ignored, dense output) and fit it."""
        encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)
        self.ohe = encoder
        encoder.fit(df[self.features])

    def transform(self, df: pd.DataFrame):
        """Return a frame where ``features`` are replaced by indicator columns.

        ``features`` are dropped from the passed-in ``df`` in place.
        """
        encoded = self.ohe.transform(df[self.features])
        encoded_df = pd.DataFrame(
            encoded,
            columns=self.ohe.get_feature_names_out(self.features),
            index=df.index,
        )
        df.drop(self.features, axis=1, inplace=True)
        return pd.concat([df, encoded_df], axis=1)
|
||||
|
||||
|
||||
class LabelEncode(MLProcess):
    """Label-encode selected columns, mapping unseen values to 'unknown'."""

    def __init__(self, features: list,):
        self.features = features
        # One fitted LabelEncoder per entry of ``features``, in the same order.
        self.le_encoders = []

    def fit(self, df: pd.DataFrame):
        """Fit one LabelEncoder per feature on its string values plus 'unknown'."""
        # Rebuild the list on every call: appending would keep encoders from
        # an earlier fit at the indices that transform() reads.
        self.le_encoders = [
            LabelEncoder().fit(df[col].astype(str).unique().tolist() + ['unknown'])
            for col in self.features
        ]

    def transform(self, df: pd.DataFrame):
        """Overwrite each feature column with its integer codes; return ``df``.

        Values whose string form was not seen by the matching encoder are
        encoded as 'unknown'.
        """
        for col, encoder in zip(self.features, self.le_encoders):
            known = set(encoder.classes_)
            values = [x if x in known else 'unknown' for x in df[col].astype(str).tolist()]
            df[col] = encoder.transform(values)
        return df


def get_column_info(df: pd.DataFrame) -> str:
    """Return a text table with one row per column of ``df``.

    Each row holds the column name, its dtype, the percentage of NaN values
    (formatted to two significant digits) and the number of unique values.
    """
    data = []
    for i in df.columns:
        nan_freq = float("%.2g" % (df[i].isna().mean() * 100))
        n_unique = df[i].nunique()
        data.append([i, df[i].dtype, nan_freq, n_unique])

    samples = pd.DataFrame(
        data,
        columns=["Column_name", "Data_type", "NaN_Frequency(%)", "N_unique"],
    )
    return samples.to_string(index=False)


if __name__ == '__main__':
    def run():
        """Print the output of the function-style preprocessing tools on a toy frame."""
        # np.nan instead of np.NAN: the upper-case alias was removed in NumPy 2.0.
        V = {
            'a': [-1, 2, 3, 6, 5, 4],
            'b': [1.1, 2.2, 3.3, 6.6, 5.5, 4.4],
            'c': ['aa', 'bb', 'cc', 'dd', 'ee', 'ff'],
            'd': [1, None, 3, None, 5, 4],
            'e': [1.1, np.nan, 3.3, None, 5.5, 4.4],
            'f': ['aa', np.nan, 'cc', None, '', 'ff'],

        }

        df = pd.DataFrame(V)
        print(df.dtypes)

        numeric_features = ['a', 'b', 'd', 'e']
        numeric_features_wo_miss = ['a', 'b', ]
        categorial_features = ['c', 'f']

        df_ = fill_missing_value(df.copy(), numeric_features)
        print(df_)
        df_ = fill_missing_value(df.copy(), categorial_features, strategy='constant', fill_value='hehe')
        print(df_)

        df_ = fill_missing_value(df.copy(), numeric_features, strategy='constant', fill_value=999)
        print(df_)

        # df_ = label_encode(df.copy(), numeric_features + categorial_features, )
        # print(df_)

        df_ = split_bins(df.copy(), numeric_features_wo_miss, strategy='quantile')
        print(df_)

        df_ = min_max_scale(df.copy(), numeric_features, )
        print(df_)

        df_ = standard_scale(df.copy(), numeric_features, )
        print(df_)

        df_ = log_transform(df.copy(), numeric_features, )
        print(df_)

        df_ = max_abs_scale(df.copy(), numeric_features, )
        print(df_)

        df_ = robust_scale(df.copy(), numeric_features, )
        print(df_)
    run()
|
||||
#
|
||||
#
|
||||
# if __name__ == '__main__':
|
||||
# def run():
|
||||
# V = {
|
||||
# 'a': [-1, 2, 3, 6, 5, 4],
|
||||
# 'b': [1.1, 2.2, 3.3, 6.6, 5.5, 4.4],
|
||||
# 'c': ['aa', 'bb', 'cc', 'dd', 'ee', 'ff'],
|
||||
# 'd': [1, None, 3, None, 5, 4],
|
||||
# 'e': [1.1, np.NAN, 3.3, None, 5.5, 4.4],
|
||||
# 'f': ['aa', np.NAN, 'cc', None, '', 'ff'],
|
||||
#
|
||||
# }
|
||||
#
|
||||
# df = pd.DataFrame(V)
|
||||
# print(df.dtypes)
|
||||
#
|
||||
# numeric_features = ['a', 'b', 'd', 'e']
|
||||
# numeric_features_wo_miss = ['a', 'b', ]
|
||||
# categorial_features = ['c', 'f']
|
||||
#
|
||||
# df_ = fill_missing_value(df.copy(), numeric_features)
|
||||
# print(df_)
|
||||
# df_ = fill_missing_value(df.copy(), categorial_features, strategy='constant', fill_value='hehe')
|
||||
# print(df_)
|
||||
#
|
||||
# df_ = fill_missing_value(df.copy(), numeric_features, strategy='constant', fill_value=999)
|
||||
# print(df_)
|
||||
#
|
||||
# # df_ = label_encode(df.copy(), numeric_features + categorial_features, )
|
||||
# # print(df_)
|
||||
#
|
||||
# df_ = split_bins(df.copy(), numeric_features_wo_miss, strategy='quantile')
|
||||
# print(df_)
|
||||
#
|
||||
# df_ = min_max_scale(df.copy(), numeric_features, )
|
||||
# print(df_)
|
||||
#
|
||||
# df_ = standard_scale(df.copy(), numeric_features, )
|
||||
# print(df_)
|
||||
#
|
||||
# df_ = log_transform(df.copy(), numeric_features, )
|
||||
# print(df_)
|
||||
#
|
||||
# df_ = max_abs_scale(df.copy(), numeric_features, )
|
||||
# print(df_)
|
||||
#
|
||||
# df_ = robust_scale(df.copy(), numeric_features, )
|
||||
# print(df_)
|
||||
# run()
|
||||
|
|
@ -3,188 +3,285 @@
|
|||
# @Time : 2023/11/17 10:33
|
||||
# @Author : lidanyang
|
||||
# @File : feature_engineering.py
|
||||
# @Desc : Feature Engineering Functions
|
||||
# @Desc : Feature Engineering Tools
|
||||
import itertools
|
||||
|
||||
import numpy as np
|
||||
from dateutil.relativedelta import relativedelta
|
||||
from joblib import Parallel, delayed
|
||||
from pandas.api.types import is_numeric_dtype
|
||||
from sklearn.model_selection import KFold
|
||||
from sklearn.preprocessing import PolynomialFeatures
|
||||
from sklearn.preprocessing import PolynomialFeatures, KBinsDiscretizer
|
||||
|
||||
from metagpt.tools.functions import registry
|
||||
from metagpt.tools.functions.libs.base import MLProcess
|
||||
from metagpt.tools.functions.schemas.feature_engineering import *
|
||||
|
||||
|
||||
# The legacy function is defined before the class so that the decorator
# argument ``PolynomialExpansion`` still resolves to the name that was in
# scope before the class statement rebinds it.
@registry.register("feature_engineering", PolynomialExpansion)
def polynomial_expansion(df, cols, degree=2):
    """Append polynomial features built from ``cols`` to ``df``.

    NaNs in ``cols`` are treated as 0 when the features are computed. The
    original ``cols`` stay in the frame; only the new terms are appended.

    Raises:
        ValueError: if any column in ``cols`` is not numeric.
    """
    for col in cols:
        if not is_numeric_dtype(df[col]):
            raise ValueError(f"Column '{col}' must be numeric.")

    poly = PolynomialFeatures(degree=degree, include_bias=False)
    ts_data = poly.fit_transform(df[cols].fillna(0))
    new_columns = poly.get_feature_names_out(cols)
    ts_data = pd.DataFrame(ts_data, columns=new_columns, index=df.index)
    # Drop the generated columns that carry the input column names, so they
    # do not duplicate the columns already present in df.
    ts_data = ts_data.drop(cols, axis=1)
    df = pd.concat([df, ts_data], axis=1)
    return df


class PolynomialExpansion(MLProcess):
    """Replace selected columns by their PolynomialFeatures expansion."""

    def __init__(self, cols: list, degree: int = 2):
        self.cols = cols
        self.degree = degree
        self.poly = PolynomialFeatures(degree=degree, include_bias=False)

    def fit(self, df: pd.DataFrame):
        """Fit the PolynomialFeatures transformer on ``cols`` with NaNs set to 0."""
        self.poly.fit(df[self.cols].fillna(0))

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a frame where ``cols`` are replaced by the generated features.

        ``cols`` are dropped from the passed-in ``df`` in place, then the
        generated columns (computed with NaNs set to 0) are concatenated.
        """
        ts_data = self.poly.transform(df[self.cols].fillna(0))
        column_name = self.poly.get_feature_names_out(self.cols)
        ts_data = pd.DataFrame(ts_data, index=df.index, columns=column_name)
        df.drop(self.cols, axis=1, inplace=True)
        df = pd.concat([df, ts_data], axis=1)
        return df
|
||||
|
||||
|
||||
@registry.register("feature_engineering", FrequencyEncoding)
def frequency_encoding(df, cols):
    """Add a ``<col>_cnt`` column holding each value's count, for every col in ``cols``."""
    for name in cols:
        counts = df[name].value_counts().to_dict()
        df[f"{name}_cnt"] = df[name].map(counts)
    return df
|
||||
class CatCount(MLProcess):
    """Add a count-encoding column for one categorical column."""

    def __init__(self, col: str):
        self.col = col
        # value -> count mapping, built by fit().
        self.encoder_dict = None

    def fit(self, df: pd.DataFrame):
        """Record how often each value of ``col`` occurs in ``df``."""
        counts = df[self.col].value_counts()
        self.encoder_dict = counts.to_dict()

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add ``<col>_cnt`` by mapping ``col`` through the fitted counts."""
        mapped = df[self.col].map(self.encoder_dict)
        df[f"{self.col}_cnt"] = mapped
        return df
|
||||
|
||||
|
||||
@registry.register("feature_engineering", TargetMeanEncoder)
def target_mean_encoder(df, col, label):
    """Add ``<col>_target_mean``: the mean of ``label`` within each ``col`` group."""
    group_means = df.groupby(col)[label].mean()
    mapping = group_means.to_dict()
    df[f"{col}_target_mean"] = df[col].map(mapping)
    return df
|
||||
class TargetMeanEncoder(MLProcess):
    """Encode a categorical column by the mean of the label per category."""

    def __init__(self, col: str, label: str):
        self.col = col
        self.label = label
        # category -> mean(label) mapping, built by fit().
        self.encoder_dict = None

    def fit(self, df: pd.DataFrame):
        """Compute the mean of ``label`` for each value of ``col``."""
        group_means = df.groupby(self.col)[self.label].mean()
        self.encoder_dict = group_means.to_dict()

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add ``<col>_target_mean`` by mapping ``col`` through the fitted means."""
        mapped = df[self.col].map(self.encoder_dict)
        df[f"{self.col}_target_mean"] = mapped
        return df
|
||||
|
||||
|
||||
def _kfold_target_mean_mapping(df, col, label, n_splits, random_state):
    """Return ``{category: mean of its out-of-fold target means}`` for ``col``.

    Each row first gets the label mean of its category computed on the other
    folds; categories missing from the training folds fall back to the global
    label mean. The per-row values are then averaged per category.
    """
    tmp = df.copy()
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    global_mean = tmp[label].mean()
    col_name = f"{col}_kf_target_mean"
    for trn_idx, val_idx in kf.split(tmp, tmp[label]):
        _trn, _val = tmp.iloc[trn_idx], tmp.iloc[val_idx]
        tmp.loc[tmp.index[val_idx], col_name] = _val[col].map(
            _trn.groupby(col)[label].mean()
        )
    # Assign the result back: fillna(inplace=True) on a column selection is
    # chained assignment and has no effect under pandas Copy-on-Write.
    tmp[col_name] = tmp[col_name].fillna(global_mean)
    return tmp.groupby(col)[col_name].mean().to_dict()


# The legacy function is defined before the class so that the decorator
# argument ``KFoldTargetMeanEncoder`` still resolves to the name that was in
# scope before the class statement rebinds it.
@registry.register("feature_engineering", KFoldTargetMeanEncoder)
def k_fold_target_mean_encoder(df, col, label, n_splits=5, random_state=2021):
    """Add ``<col>_kf_target_mean`` computed with K-fold target-mean encoding."""
    encoder_dict = _kfold_target_mean_mapping(df, col, label, n_splits, random_state)
    df[f"{col}_kf_target_mean"] = df[col].map(encoder_dict)
    return df


@registry.register("feature_engineering", CatCross)
def cat_cross(df, cols, max_cat_num=100):
    """Add an integer-coded cross feature for every pair of ``cols``.

    Columns with more than ``max_cat_num`` unique values are left out.
    """
    # Filter into a new list: removing from ``cols`` while iterating it skips
    # the element that follows each removed one.
    kept = [col for col in cols if df[col].nunique() <= max_cat_num]

    for col1, col2 in itertools.combinations(kept, 2):
        cross_col = f"{col1}_cross_{col2}"
        crossed = df[col1].astype(str) + "_" + df[col2].astype(str)
        df[cross_col] = crossed.astype('category').cat.codes
    return df


@registry.register("feature_engineering", GroupStat)
def group_stat(df, group_col, agg_col, agg_funcs):
    """Left-merge per-group aggregates of ``agg_col`` onto ``df``.

    New columns are named ``<agg_col>_<agg_func>_by_<group_col>``.
    ``group_col`` may be a single column name or a list of names.
    """
    # A bare string cannot be concatenated with the list of new names.
    group_keys = [group_col] if isinstance(group_col, str) else group_col
    group_df = df.groupby(group_col)[agg_col].agg(agg_funcs).reset_index()
    group_df.columns = group_keys + [
        f"{agg_col}_{agg_func}_by_{group_col}" for agg_func in agg_funcs
    ]
    df = df.merge(group_df, on=group_col, how="left")
    return df


@registry.register("feature_engineering", ExtractTimeComps)
def extract_time_comps(df, time_col, time_comps):
    """Append the requested datetime components of ``time_col`` to ``df``.

    Recognised entries of ``time_comps``: "year", "month", "day", "hour",
    "dayofweek" (shifted to 1-7) and "is_weekend" (1 when dayofweek is 5 or 6).
    Values that cannot be parsed as datetimes are coerced to NaT.
    """
    time_s = pd.to_datetime(df[time_col], errors="coerce")
    time_comps_df = pd.DataFrame()

    if "year" in time_comps:
        time_comps_df["year"] = time_s.dt.year
    if "month" in time_comps:
        time_comps_df["month"] = time_s.dt.month
    if "day" in time_comps:
        time_comps_df["day"] = time_s.dt.day
    if "hour" in time_comps:
        time_comps_df["hour"] = time_s.dt.hour
    if "dayofweek" in time_comps:
        time_comps_df["dayofweek"] = time_s.dt.dayofweek + 1
    if "is_weekend" in time_comps:
        time_comps_df["is_weekend"] = time_s.dt.dayofweek.isin([5, 6]).astype(int)
    df = pd.concat([df, time_comps_df], axis=1)
    return df


@registry.register("feature_engineering", FeShiftByTime)
def fe_shift_by_time(df, time_col, group_col, shift_col, periods, freq):
    """Add lag features of ``shift_col`` per ``group_col``, shifted in time.

    For each entry of ``periods`` a column
    ``<group_col>_<shift_col>_lag_<period>_<freq>`` is left-merged onto ``df``.
    ``time_col`` of the passed-in frame is converted to datetime in place.
    """
    df[time_col] = pd.to_datetime(df[time_col])

    def shift_datetime(date, offset, unit):
        """Return ``date`` moved by ``offset`` units; unknown units return it unchanged."""
        if unit in ["year", "y", "Y"]:
            return date + relativedelta(years=offset)
        elif unit in ["month", "m", "M"]:
            return date + relativedelta(months=offset)
        elif unit in ["day", "d", "D"]:
            return date + relativedelta(days=offset)
        elif unit in ["week", "w", "W"]:
            return date + relativedelta(weeks=offset)
        elif unit in ["hour", "h", "H"]:
            return date + relativedelta(hours=offset)
        else:
            return date

    def shift_by_time_on_key(
        inner_df, time_col, group_col, shift_col, offset, unit, col_name
    ):
        """Shift timestamps, average ``shift_col`` per (time, group), rename it."""
        inner_df = inner_df.drop_duplicates()
        inner_df[time_col] = inner_df[time_col].map(
            lambda x: shift_datetime(x, offset, unit)
        )
        inner_df = inner_df.groupby([time_col, group_col], as_index=False)[
            shift_col
        ].mean()
        inner_df.rename(columns={shift_col: col_name}, inplace=True)
        return inner_df

    shift_df = df[[time_col, group_col, shift_col]].copy()
    for period in periods:
        new_col_name = f"{group_col}_{shift_col}_lag_{period}_{freq}"
        tmp = shift_by_time_on_key(
            shift_df, time_col, group_col, shift_col, period, freq, new_col_name
        )
        df = df.merge(tmp, on=[time_col, group_col], how="left")

    return df


@registry.register("feature_engineering", FeRollingByTime)
def fe_rolling_by_time(df, time_col, group_col, rolling_col, periods, freq, agg_funcs):
    """Add time-window rolling aggregates of ``rolling_col`` per ``group_col``.

    For each period and aggregation a column
    ``<group_col>_<rolling_col>_rolling_<period>_<freq>_<func>`` is left-merged
    onto ``df``. ``time_col`` of the passed-in frame is converted to datetime
    in place.

    Raises:
        ValueError: if an entry of ``agg_funcs`` is not one of mean, std, max,
            min, median, sum, count.
    """
    df[time_col] = pd.to_datetime(df[time_col])

    def rolling_by_time_on_key(inner_df, offset, unit, agg_func, col_name):
        """Roll ``inner_df`` over a left-closed time window and stack the result."""
        # Window length per unit: years and months are approximated as 365
        # and 30 days.
        time_freq = {
            "Y": [365 * offset, "D"],
            "M": [30 * offset, "D"],
            "D": [offset, "D"],
            "W": [7 * offset, "D"],
            "H": [offset, "h"],
        }

        if agg_func not in ["mean", "std", "max", "min", "median", "sum", "count"]:
            raise ValueError(f"Invalid agg function: {agg_func}")

        rolling_feat = inner_df.rolling(
            f"{time_freq[unit][0]}{time_freq[unit][1]}", closed="left"
        )
        rolling_feat = getattr(rolling_feat, agg_func)()
        # NOTE(review): reads the column depth of the enclosing ``df`` rather
        # than ``inner_df`` -- confirm this is intended.
        depth = df.columns.nlevels
        rolling_feat = rolling_feat.stack(list(range(depth)))
        rolling_feat.name = col_name
        return rolling_feat

    rolling_df = df[[time_col, group_col, rolling_col]].copy()
    for period in periods:
        for func in agg_funcs:
            new_col_name = f"{group_col}_{rolling_col}_rolling_{period}_{freq}_{func}"
            tmp = pd.pivot_table(
                rolling_df,
                index=time_col,
                values=rolling_col,
                columns=group_col,
            )
            tmp = rolling_by_time_on_key(tmp, period, freq, func, new_col_name)
            df = df.merge(tmp, on=[time_col, group_col], how="left")

    return df


class KFoldTargetMeanEncoder(MLProcess):
    """Encode a categorical column with K-fold (out-of-fold) target means."""

    def __init__(self, col: str, label: str, n_splits: int = 5, random_state: int = 2021):
        self.col = col
        self.label = label
        self.n_splits = n_splits
        self.random_state = random_state
        # category -> encoded value mapping, built by fit().
        self.encoder_dict = None

    def fit(self, df: pd.DataFrame):
        """Build the category -> averaged out-of-fold target mean mapping."""
        self.encoder_dict = _kfold_target_mean_mapping(
            df, self.col, self.label, self.n_splits, self.random_state
        )

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add ``<col>_kf_target_mean`` by mapping ``col`` through the fitted mapping."""
        df[f"{self.col}_kf_target_mean"] = df[self.col].map(self.encoder_dict)
        return df
|
||||
|
||||
|
||||
class CatCross(MLProcess):
    """Add an integer-coded cross feature for every pair of selected columns."""

    def __init__(self, cols: list, max_cat_num: int = 100):
        self.cols = cols
        self.max_cat_num = max_cat_num
        # Column pairs to cross, built by fit().
        self.combs = []
        # new column name -> {(value_a, value_b): code}, built by fit().
        self.combs_map = {}

    @staticmethod
    def cross_two(comb, df):
        """Return the new column name and the value-pair -> code map for ``comb``.

        Codes number the Cartesian product of the two columns' unique values
        in iteration order, starting at 0.
        """
        new_col = f'{comb[0]}_{comb[1]}'
        new_col_combs = itertools.product(df[comb[0]].unique(), df[comb[1]].unique())
        comb_map = {pair: code for code, pair in enumerate(new_col_combs)}
        return new_col, comb_map

    def fit(self, df: pd.DataFrame):
        """Select the usable columns and build one code map per column pair.

        Columns with more than ``max_cat_num`` unique values are left out.
        """
        # Build a filtered list: removing from self.cols while iterating it
        # skips the element that follows each removed one.
        self.cols = [col for col in self.cols if df[col].nunique() <= self.max_cat_num]
        self.combs = list(itertools.combinations(self.cols, 2))
        res = Parallel(n_jobs=4, require='sharedmem')(
            delayed(self.cross_two)(comb, df) for comb in self.combs)
        self.combs_map = dict(res)

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add one ``<a>_<b>`` integer column per fitted pair; return ``df``.

        Value pairs not seen in fit get the code ``len(map)``, which is one
        past the largest fitted code.
        """
        for comb in self.combs:
            new_col = f'{comb[0]}_{comb[1]}'
            _map = self.combs_map[new_col]
            # Reuse df.index so the assignment aligns row by row even when df
            # does not have a default RangeIndex.
            pairs = pd.Series(list(zip(df[comb[0]], df[comb[1]])), index=df.index)
            # Codes are 0..len-1, so len(_map) is max + 1 and also works for
            # an empty map.
            df[new_col] = pairs.map(_map).fillna(len(_map)).astype(int)
        return df
|
||||
|
||||
|
||||
class GroupStat(MLProcess):
    """Attach per-group aggregates of one column to the frame."""

    def __init__(self, group_col: str, agg_col: str, agg_funcs: list):
        self.group_col = group_col
        self.agg_col = agg_col
        self.agg_funcs = agg_funcs
        # Aggregated lookup table, built by fit().
        self.group_df = None

    def fit(self, df: pd.DataFrame):
        """Aggregate ``agg_col`` per ``group_col`` and store the renamed table.

        Aggregate columns are named ``<agg_col>_<agg_func>_by_<group_col>``.
        """
        stats = df.groupby(self.group_col)[self.agg_col].agg(self.agg_funcs).reset_index()
        renamed = [self.group_col]
        for agg_func in self.agg_funcs:
            renamed.append(f"{self.agg_col}_{agg_func}_by_{self.group_col}")
        stats.columns = renamed
        self.group_df = stats

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return ``df`` left-merged with the fitted table on ``group_col``."""
        return df.merge(self.group_df, on=self.group_col, how="left")
|
||||
|
||||
|
||||
class SplitBins(MLProcess):
    """Replace selected columns by ordinal bin codes from a KBinsDiscretizer."""

    def __init__(self, cols: list, strategy: str = 'quantile'):
        # A single column label is wrapped in a list so that df[self.cols] is
        # always a 2-D selection, which the discretizer requires.
        self.cols = [cols] if isinstance(cols, str) else cols
        self.strategy = strategy
        # The discretizer is created by fit().
        self.encoder = None

    def fit(self, df: pd.DataFrame):
        """Create a KBinsDiscretizer and fit it on ``cols`` with NaNs set to 0."""
        self.encoder = KBinsDiscretizer(strategy=self.strategy, encode='ordinal')
        self.encoder.fit(df[self.cols].fillna(0))

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Overwrite ``cols`` with their bin codes (NaNs set to 0 first); return ``df``."""
        df[self.cols] = self.encoder.transform(df[self.cols].fillna(0))
        return df
|
||||
|
||||
# @registry.register("feature_engineering", ExtractTimeComps)
|
||||
# def extract_time_comps(df, time_col, time_comps):
|
||||
# time_s = pd.to_datetime(df[time_col], errors="coerce")
|
||||
# time_comps_df = pd.DataFrame()
|
||||
#
|
||||
# if "year" in time_comps:
|
||||
# time_comps_df["year"] = time_s.dt.year
|
||||
# if "month" in time_comps:
|
||||
# time_comps_df["month"] = time_s.dt.month
|
||||
# if "day" in time_comps:
|
||||
# time_comps_df["day"] = time_s.dt.day
|
||||
# if "hour" in time_comps:
|
||||
# time_comps_df["hour"] = time_s.dt.hour
|
||||
# if "dayofweek" in time_comps:
|
||||
# time_comps_df["dayofweek"] = time_s.dt.dayofweek + 1
|
||||
# if "is_weekend" in time_comps:
|
||||
# time_comps_df["is_weekend"] = time_s.dt.dayofweek.isin([5, 6]).astype(int)
|
||||
# df = pd.concat([df, time_comps_df], axis=1)
|
||||
# return df
|
||||
#
|
||||
#
|
||||
# @registry.register("feature_engineering", FeShiftByTime)
|
||||
# def fe_shift_by_time(df, time_col, group_col, shift_col, periods, freq):
|
||||
# df[time_col] = pd.to_datetime(df[time_col])
|
||||
#
|
||||
# def shift_datetime(date, offset, unit):
|
||||
# if unit in ["year", "y", "Y"]:
|
||||
# return date + relativedelta(years=offset)
|
||||
# elif unit in ["month", "m", "M"]:
|
||||
# return date + relativedelta(months=offset)
|
||||
# elif unit in ["day", "d", "D"]:
|
||||
# return date + relativedelta(days=offset)
|
||||
# elif unit in ["week", "w", "W"]:
|
||||
# return date + relativedelta(weeks=offset)
|
||||
# elif unit in ["hour", "h", "H"]:
|
||||
# return date + relativedelta(hours=offset)
|
||||
# else:
|
||||
# return date
|
||||
#
|
||||
# def shift_by_time_on_key(
|
||||
# inner_df, time_col, group_col, shift_col, offset, unit, col_name
|
||||
# ):
|
||||
# inner_df = inner_df.drop_duplicates()
|
||||
# inner_df[time_col] = inner_df[time_col].map(
|
||||
# lambda x: shift_datetime(x, offset, unit)
|
||||
# )
|
||||
# inner_df = inner_df.groupby([time_col, group_col], as_index=False)[
|
||||
# shift_col
|
||||
# ].mean()
|
||||
# inner_df.rename(columns={shift_col: col_name}, inplace=True)
|
||||
# return inner_df
|
||||
#
|
||||
# shift_df = df[[time_col, group_col, shift_col]].copy()
|
||||
# for period in periods:
|
||||
# new_col_name = f"{group_col}_{shift_col}_lag_{period}_{freq}"
|
||||
# tmp = shift_by_time_on_key(
|
||||
# shift_df, time_col, group_col, shift_col, period, freq, new_col_name
|
||||
# )
|
||||
# df = df.merge(tmp, on=[time_col, group_col], how="left")
|
||||
#
|
||||
# return df
|
||||
#
|
||||
#
|
||||
# @registry.register("feature_engineering", FeRollingByTime)
|
||||
# def fe_rolling_by_time(df, time_col, group_col, rolling_col, periods, freq, agg_funcs):
|
||||
# df[time_col] = pd.to_datetime(df[time_col])
|
||||
#
|
||||
# def rolling_by_time_on_key(inner_df, offset, unit, agg_func, col_name):
|
||||
# time_freq = {
|
||||
# "Y": [365 * offset, "D"],
|
||||
# "M": [30 * offset, "D"],
|
||||
# "D": [offset, "D"],
|
||||
# "W": [7 * offset, "D"],
|
||||
# "H": [offset, "h"],
|
||||
# }
|
||||
#
|
||||
# if agg_func not in ["mean", "std", "max", "min", "median", "sum", "count"]:
|
||||
# raise ValueError(f"Invalid agg function: {agg_func}")
|
||||
#
|
||||
# rolling_feat = inner_df.rolling(
|
||||
# f"{time_freq[unit][0]}{time_freq[unit][1]}", closed="left"
|
||||
# )
|
||||
# rolling_feat = getattr(rolling_feat, agg_func)()
|
||||
# depth = df.columns.nlevels
|
||||
# rolling_feat = rolling_feat.stack(list(range(depth)))
|
||||
# rolling_feat.name = col_name
|
||||
# return rolling_feat
|
||||
#
|
||||
# rolling_df = df[[time_col, group_col, rolling_col]].copy()
|
||||
# for period in periods:
|
||||
# for func in agg_funcs:
|
||||
# new_col_name = f"{group_col}_{rolling_col}_rolling_{period}_{freq}_{func}"
|
||||
# tmp = pd.pivot_table(
|
||||
# rolling_df,
|
||||
# index=time_col,
|
||||
# values=rolling_col,
|
||||
# columns=group_col,
|
||||
# )
|
||||
# tmp = rolling_by_time_on_key(tmp, period, freq, func, new_col_name)
|
||||
# df = df.merge(tmp, on=[time_col, group_col], how="left")
|
||||
#
|
||||
# return df
|
||||
|
||||
|
||||
class GeneralSelection(MLProcess):
    """Drop feature columns that are all-NaN, constant, or contain infinities."""

    def __init__(self, label_col: str):
        self.label_col = label_col
        # Names of the columns kept by fit().
        self.feats = []

    def fit(self, df: pd.DataFrame):
        """Select the feature columns to keep.

        The label column is never part of the selection. A feature is dropped
        when every value is null, when it has exactly one unique value, or
        when it contains ``inf`` or ``-inf``.
        """
        feats = []
        for col in df.columns:
            # The label is not a candidate feature and must not be tested.
            if col == self.label_col:
                continue
            series = df[col]
            # Each check skips the column outright, so a column that fails
            # several checks is never removed twice.
            if series.isnull().mean() == 1:
                continue
            if series.nunique() == 1:
                continue
            if series.isin([np.inf, -np.inf]).any():
                continue
            feats.append(col)
        self.feats = feats

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return ``df`` restricted to the columns selected by ``fit``."""
        df = df[self.feats]
        return df
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue