Merge branch 'dev' into 'dev_make_tools'

# Conflicts:
#   metagpt/roles/ml_engineer.py
This commit is contained in:
刘棒棒 2023-12-26 06:18:27 +00:00
commit c76c1765ef
6 changed files with 276 additions and 55 deletions

View file

@ -1,3 +1,5 @@
import json
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
@ -20,10 +22,14 @@ class FillMissingValue(MLProcess):
self.si = None
def fit(self, df: pd.DataFrame):
    """Fit a SimpleImputer on the configured feature columns.

    No-op when no features are configured; otherwise builds an imputer
    with the instance's strategy/fill_value and fits it on df[features].
    """
    if self.features:
        self.si = SimpleImputer(strategy=self.strategy, fill_value=self.fill_value)
        self.si.fit(df[self.features])
def transform(self, df: pd.DataFrame):
    """Impute the configured feature columns in place and return the frame.

    Returns df unchanged when no features are configured.
    """
    if self.features:
        df[self.features] = self.si.transform(df[self.features])
    return df
@ -122,11 +128,15 @@ class LabelEncode(MLProcess):
self.le_encoders = []
def fit(self, df: pd.DataFrame):
    """Fit one LabelEncoder per configured feature.

    Each encoder's classes include an extra 'unknown' bucket so values
    unseen at fit time can still be mapped at transform time.
    No-op when no features are configured.
    """
    if not self.features:
        return
    self.le_encoders.extend(
        LabelEncoder().fit(df[col].astype(str).unique().tolist() + ['unknown'])
        for col in self.features
    )
def transform(self, df: pd.DataFrame):
if len(self.features) == 0:
return df
for i in range(len(self.features)):
data_list = df[self.features[i]].astype(str).tolist()
for unique_item in np.unique(df[self.features[i]].astype(str)):
@ -137,17 +147,23 @@ class LabelEncode(MLProcess):
def get_column_info(df: pd.DataFrame) -> dict:
    """Group the DataFrame's columns by coarse dtype family.

    Args:
        df: Input DataFrame.

    Returns:
        A dict with keys "Category", "Numeric", "Datetime" and "Others",
        each mapping to a list of column names. If the JSON-serialized
        result exceeds 2000 characters, the "Numeric" list is truncated to
        its first 5 entries plus an omission marker so the summary stays
        prompt-sized.
    """
    # NOTE(review): the rendered merge diff interleaved the removed
    # nan_freq/samples implementation with this added one; this is the
    # reconstructed post-merge version.
    column_info = {
        "Category": [],
        "Numeric": [],
        "Datetime": [],
        "Others": [],
    }
    for col in df.columns:
        data_type = str(df[col].dtype).replace("dtype('", "").replace("')", "")
        if data_type.startswith("object"):
            column_info["Category"].append(col)
        elif data_type.startswith("int") or data_type.startswith("float"):
            column_info["Numeric"].append(col)
        elif data_type.startswith("datetime"):
            column_info["Datetime"].append(col)
        else:
            # bool, category, timedelta, etc. fall through here
            column_info["Others"].append(col)

    if len(json.dumps(column_info)) > 2000:
        column_info['Numeric'] = column_info['Numeric'][0:5] + ['Too many cols, omission here...']
    return column_info

View file

@ -6,12 +6,12 @@
# @Desc : Feature Engineering Tools
import itertools
import lightgbm as lgb
import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta
from joblib import Parallel, delayed
from pandas.api.types import is_numeric_dtype
from pandas.core.dtypes.common import is_object_dtype
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import KFold
from sklearn.preprocessing import PolynomialFeatures, KBinsDiscretizer
@ -19,15 +19,27 @@ from metagpt.tools.functions.libs.base import MLProcess
class PolynomialExpansion(MLProcess):
def __init__(self, cols: list, degree: int = 2, label_col: str = None):
    """Initialize the polynomial expansion.

    Args:
        cols: Columns to expand; the label column is removed if present.
        degree: Degree of the polynomial features (default 2).
        label_col: Label column name, used to rank columns at fit time;
            backward-compatible addition (defaults to None).
    """
    # NOTE(review): the diff rendering kept both the old and new __init__
    # headers; this is the merged (new) signature only.
    self.cols = cols
    self.degree = degree
    self.label_col = label_col
    if self.label_col in self.cols:
        self.cols.remove(self.label_col)
    self.poly = PolynomialFeatures(degree=degree, include_bias=False)
def fit(self, df: pd.DataFrame):
    """Fit the polynomial expansion on the configured columns.

    When more than 10 columns are configured, keeps only the 10 most
    label-correlated ones (by absolute correlation, label itself excluded).
    NaNs are filled with 0 before fitting. No-op when no columns are set.
    """
    if not self.cols:
        return
    if len(self.cols) > 10:
        ranked = (
            df[self.cols + [self.label_col]]
            .corr()[self.label_col]
            .abs()
            .sort_values(ascending=False)
        )
        # Index 0 is the label's self-correlation; take the next 10.
        self.cols = ranked.index.tolist()[1:11]
    self.poly.fit(df[self.cols].fillna(0))
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
if len(self.cols) == 0:
return df
ts_data = self.poly.transform(df[self.cols].fillna(0))
column_name = self.poly.get_feature_names_out(self.cols)
ts_data = pd.DataFrame(ts_data, index=df.index, columns=column_name)
@ -158,27 +170,35 @@ class SplitBins(MLProcess):
df[self.cols] = self.encoder.transform(df[self.cols].fillna(0))
return df
# @registry.register("feature_engineering", ExtractTimeComps)
# def extract_time_comps(df, time_col, time_comps):
# time_s = pd.to_datetime(df[time_col], errors="coerce")
# time_comps_df = pd.DataFrame()
#
# if "year" in time_comps:
# time_comps_df["year"] = time_s.dt.year
# if "month" in time_comps:
# time_comps_df["month"] = time_s.dt.month
# if "day" in time_comps:
# time_comps_df["day"] = time_s.dt.day
# if "hour" in time_comps:
# time_comps_df["hour"] = time_s.dt.hour
# if "dayofweek" in time_comps:
# time_comps_df["dayofweek"] = time_s.dt.dayofweek + 1
# if "is_weekend" in time_comps:
# time_comps_df["is_weekend"] = time_s.dt.dayofweek.isin([5, 6]).astype(int)
# df = pd.concat([df, time_comps_df], axis=1)
# return df
#
#
class ExtractTimeComps(MLProcess):
    """Append calendar component columns derived from a datetime column.

    Parameters:
        time_col: Column parsed with ``pd.to_datetime`` (errors coerced to NaT).
        time_comps: Components to emit; any of "year", "month", "day", "hour",
            "dayofweek" (Monday=1 .. Sunday=7) and "is_weekend" (0/1 flag).
    """

    def __init__(self, time_col: str, time_comps: list):
        self.time_col = time_col
        self.time_comps = time_comps

    def fit(self, df: pd.DataFrame):
        # Stateless transform; nothing to learn.
        pass

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return df with the requested time-component columns concatenated."""
        parsed = pd.to_datetime(df[self.time_col], errors="coerce")
        extractors = {
            "year": lambda s: s.dt.year,
            "month": lambda s: s.dt.month,
            "day": lambda s: s.dt.day,
            "hour": lambda s: s.dt.hour,
            "dayofweek": lambda s: s.dt.dayofweek + 1,
            "is_weekend": lambda s: s.dt.dayofweek.isin([5, 6]).astype(int),
        }
        comps = pd.DataFrame()
        # Fixed order matches the original component-by-component checks.
        for name in ("year", "month", "day", "hour", "dayofweek", "is_weekend"):
            if name in self.time_comps:
                comps[name] = extractors[name](parsed)
        return pd.concat([df, comps], axis=1)
# @registry.register("feature_engineering", FeShiftByTime)
# def fe_shift_by_time(df, time_col, group_col, shift_col, periods, freq):
# df[time_col] = pd.to_datetime(df[time_col])
@ -290,3 +310,66 @@ class GeneralSelection(MLProcess):
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
    """Project the DataFrame onto the selected features plus the label column."""
    keep = list(self.feats) + [self.label_col]
    return df[keep]
class TreeBasedSelection(MLProcess):
    """Select features via LightGBM gain importance.

    Trains a GBDT on the numeric columns and keeps only features whose
    gain importance is strictly positive; the label column is always kept.

    Parameters:
        label_col: Label column name.
        task_type: "cls" (binary), "mcls" (multiclass) or "reg" (regression).
    """

    def __init__(self, label_col: str, task_type: str):
        self.label_col = label_col
        self.task_type = task_type
        self.feats = None  # populated by fit()

    def fit(self, df: pd.DataFrame):
        """Train a 100-round GBDT and record the positively-important features."""
        params = {
            'boosting_type': 'gbdt',
            'objective': 'binary',
            'learning_rate': 0.1,
            'num_leaves': 31,
        }
        # NOTE(review): an unrecognized task_type silently falls back to the
        # binary objective above — confirm that is intended.
        if self.task_type == "cls":
            params["objective"] = "binary"
            params["metric"] = "auc"
        elif self.task_type == "mcls":
            params["objective"] = "multiclass"
            params["num_class"] = df[self.label_col].nunique()
            params["metric"] = "auc_mu"
        elif self.task_type == "reg":
            params["objective"] = "regression"
            params["metric"] = "rmse"

        feature_cols = [c for c in df.select_dtypes(include=np.number).columns if c != self.label_col]
        train_set = lgb.Dataset(df[feature_cols], df[self.label_col])
        booster = lgb.train(params, train_set, num_boost_round=100)

        importance = pd.DataFrame({'feature_name': train_set.feature_name,
                                   'importance': booster.feature_importance("gain")})
        importance.sort_values("importance", ascending=False, inplace=True)
        kept = importance.loc[importance["importance"] > 0, 'feature_name'].tolist()
        self.feats = kept + [self.label_col]

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Project the DataFrame onto the selected features (label included)."""
        return df[self.feats]
class VarianceBasedSelection(MLProcess):
    """Drop numeric features whose variance does not exceed a threshold.

    The label column is always retained.

    Parameters:
        label_col: Label column name.
        threshold: Variance threshold passed to sklearn's VarianceThreshold
            (default 0, i.e. remove constant columns).
    """

    def __init__(self, label_col: str, threshold: float = 0):
        self.label_col = label_col
        self.threshold = threshold
        self.feats = None  # populated by fit()
        self.selector = VarianceThreshold(threshold=self.threshold)

    def fit(self, df: pd.DataFrame):
        """Fit the variance filter on all numeric, non-label columns."""
        candidates = [c for c in df.select_dtypes(include=np.number).columns if c != self.label_col]
        self.selector.fit(df[candidates])
        kept_idx = self.selector.get_support(indices=True)
        self.feats = df[candidates].columns[kept_idx].tolist() + [self.label_col]

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Project the DataFrame onto the surviving features (label included)."""
        return df[self.feats]

View file

@ -11,7 +11,7 @@ FillMissingValue:
description: "columns to be processed"
strategy:
type: str
description: "the imputation strategy, notice mean/median can only be used for numeric features"
default: mean
enum:
- mean

View file

@ -1,6 +1,6 @@
PolynomialExpansion:
type: class
description: "Add polynomial and interaction features from selected numeric columns to input DataFrame."
methods:
__init__:
description: "Initialize self."
@ -9,12 +9,16 @@ PolynomialExpansion:
cols:
type: list
description: "Columns for polynomial expansion."
label_col:
type: str
description: "Label column name."
degree:
type: int
description: "The degree of the polynomial features."
default: 2
required:
- cols
- label_col
fit:
description: "Fit the PolynomialExpansion model."
parameters:
@ -36,14 +40,14 @@ PolynomialExpansion:
returns:
df:
type: DataFrame
description: "The transformed DataFrame without duplicated columns."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame without duplicated columns."
required:
- df
returns:
@ -224,7 +228,7 @@ CatCross:
properties:
cols:
type: list
description: "Columns to be pairwise crossed, at least 2 columns."
max_cat_num:
type: int
description: "Maximum unique categories per crossed feature."
@ -430,4 +434,115 @@ GeneralSelection:
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
TreeBasedSelection:
type: class
description: "Select features based on tree-based model and remove features with low importance."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
label_col:
type: str
description: "Label column name."
task_type:
type: str
description: "Task type, 'cls' for classification, 'mcls' for multi-class classification, 'reg' for regression."
enum:
- cls
- mcls
- reg
required:
- label_col
- task_type
fit:
description: "Fit the TreeBasedSelection model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame containing label_col."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame containing label_col."
VarianceBasedSelection:
type: class
description: "Select features based on variance and remove features with low variance."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
label_col:
type: str
description: "Label column name."
threshold:
type: float
description: "Threshold for variance."
default: 0.0
required:
- label_col
fit:
description: "Fit the VarianceBasedSelection model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame containing label_col."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame containing label_col."