diff --git a/metagpt/tools/functions/libs/data_preprocess.py b/metagpt/tools/functions/libs/data_preprocess.py index 8c70462ee..f1665b405 100644 --- a/metagpt/tools/functions/libs/data_preprocess.py +++ b/metagpt/tools/functions/libs/data_preprocess.py @@ -1,3 +1,5 @@ +import json + import numpy as np import pandas as pd from sklearn.impute import SimpleImputer @@ -20,10 +22,14 @@ class FillMissingValue(MLProcess): self.si = None def fit(self, df: pd.DataFrame): + if len(self.features) == 0: + return self.si = SimpleImputer(strategy=self.strategy, fill_value=self.fill_value) self.si.fit(df[self.features]) def transform(self, df: pd.DataFrame): + if len(self.features) == 0: + return df df[self.features] = self.si.transform(df[self.features]) return df @@ -122,11 +128,15 @@ class LabelEncode(MLProcess): self.le_encoders = [] def fit(self, df: pd.DataFrame): + if len(self.features) == 0: + return for col in self.features: le = LabelEncoder().fit(df[col].astype(str).unique().tolist() + ['unknown']) self.le_encoders.append(le) def transform(self, df: pd.DataFrame): + if len(self.features) == 0: + return df for i in range(len(self.features)): data_list = df[self.features[i]].astype(str).tolist() for unique_item in np.unique(df[self.features[i]].astype(str)): @@ -137,17 +147,23 @@ class LabelEncode(MLProcess): def get_column_info(df: pd.DataFrame) -> dict: - data = [] - for i in df.columns: - nan_freq = float("%.2g" % (df[i].isna().mean() * 100)) - n_unique = df[i].nunique() - data_type = str(df[i].dtype).replace("dtype('", "").replace("')", "") - if data_type == "O": - data_type = "object" - data.append([i, data_type, nan_freq, n_unique]) + column_info = { + "Category": [], + "Numeric": [], + "Datetime": [], + "Others": [], + } + for col in df.columns: + data_type = str(df[col].dtype).replace("dtype('", "").replace("')", "") + if data_type.startswith("object"): + column_info["Category"].append(col) + elif data_type.startswith("int") or data_type.startswith("float"): + column_info["Numeric"].append(col) + elif data_type.startswith("datetime"): + column_info["Datetime"].append(col) + else: + column_info["Others"].append(col) - samples = pd.DataFrame( - data, - columns=["Column_name", "Data_type", "NaN_Frequency(%)", "N_unique"], - ) - return samples.to_dict(orient='list') + if len(json.dumps(column_info)) > 2000: + column_info['Numeric'] = column_info['Numeric'][0:5] + ['Too many cols, omission here...'] + return column_info diff --git a/metagpt/tools/functions/libs/feature_engineering.py b/metagpt/tools/functions/libs/feature_engineering.py index 1ec2b9675..df36752b9 100644 --- a/metagpt/tools/functions/libs/feature_engineering.py +++ b/metagpt/tools/functions/libs/feature_engineering.py @@ -6,12 +6,12 @@ # @Desc : Feature Engineering Tools import itertools +import lightgbm as lgb import numpy as np import pandas as pd -from dateutil.relativedelta import relativedelta from joblib import Parallel, delayed -from pandas.api.types import is_numeric_dtype from pandas.core.dtypes.common import is_object_dtype +from sklearn.feature_selection import VarianceThreshold from sklearn.model_selection import KFold from sklearn.preprocessing import PolynomialFeatures, KBinsDiscretizer @@ -19,15 +19,27 @@ from metagpt.tools.functions.libs.base import MLProcess class PolynomialExpansion(MLProcess): - def __init__(self, cols: list, degree: int = 2): + def __init__(self, cols: list, degree: int = 2, label_col: str = None): self.cols = cols self.degree = degree + self.label_col = label_col + if self.label_col in self.cols: + self.cols.remove(self.label_col) self.poly = PolynomialFeatures(degree=degree, include_bias=False) def fit(self, df: pd.DataFrame): + if len(self.cols) == 0: + return + if len(self.cols) > 10: + corr = df[self.cols + [self.label_col]].corr() + corr = corr[self.label_col].abs().sort_values(ascending=False) + self.cols = corr.index.tolist()[1:11] + self.poly.fit(df[self.cols].fillna(0)) def transform(self, df: pd.DataFrame) -> pd.DataFrame: + if len(self.cols) == 0: + return df ts_data = self.poly.transform(df[self.cols].fillna(0)) column_name = self.poly.get_feature_names_out(self.cols) ts_data = pd.DataFrame(ts_data, index=df.index, columns=column_name) @@ -158,27 +170,35 @@ class SplitBins(MLProcess): df[self.cols] = self.encoder.transform(df[self.cols].fillna(0)) return df -# @registry.register("feature_engineering", ExtractTimeComps) -# def extract_time_comps(df, time_col, time_comps): -# time_s = pd.to_datetime(df[time_col], errors="coerce") -# time_comps_df = pd.DataFrame() -# -# if "year" in time_comps: -# time_comps_df["year"] = time_s.dt.year -# if "month" in time_comps: -# time_comps_df["month"] = time_s.dt.month -# if "day" in time_comps: -# time_comps_df["day"] = time_s.dt.day -# if "hour" in time_comps: -# time_comps_df["hour"] = time_s.dt.hour -# if "dayofweek" in time_comps: -# time_comps_df["dayofweek"] = time_s.dt.dayofweek + 1 -# if "is_weekend" in time_comps: -# time_comps_df["is_weekend"] = time_s.dt.dayofweek.isin([5, 6]).astype(int) -# df = pd.concat([df, time_comps_df], axis=1) -# return df -# -# + +class ExtractTimeComps(MLProcess): + def __init__(self, time_col: str, time_comps: list): + self.time_col = time_col + self.time_comps = time_comps + + def fit(self, df: pd.DataFrame): + pass + + def transform(self, df: pd.DataFrame) -> pd.DataFrame: + time_s = pd.to_datetime(df[self.time_col], errors="coerce") + time_comps_df = pd.DataFrame() + + if "year" in self.time_comps: + time_comps_df["year"] = time_s.dt.year + if "month" in self.time_comps: + time_comps_df["month"] = time_s.dt.month + if "day" in self.time_comps: + time_comps_df["day"] = time_s.dt.day + if "hour" in self.time_comps: + time_comps_df["hour"] = time_s.dt.hour + if "dayofweek" in self.time_comps: + time_comps_df["dayofweek"] = time_s.dt.dayofweek + 1 + if "is_weekend" in self.time_comps: + time_comps_df["is_weekend"] = time_s.dt.dayofweek.isin([5, 6]).astype(int) + df = pd.concat([df, time_comps_df], axis=1) + return df + + # @registry.register("feature_engineering", FeShiftByTime) # def fe_shift_by_time(df, time_col, group_col, shift_col, periods, freq): # df[time_col] = pd.to_datetime(df[time_col]) @@ -290,3 +310,66 @@ class GeneralSelection(MLProcess): def transform(self, df: pd.DataFrame) -> pd.DataFrame: df = df[self.feats + [self.label_col]] return df + + +class TreeBasedSelection(MLProcess): + def __init__(self, label_col: str, task_type: str): + self.label_col = label_col + self.task_type = task_type + self.feats = None + + def fit(self, df: pd.DataFrame): + params = { + 'boosting_type': 'gbdt', + 'objective': 'binary', + 'learning_rate': 0.1, + 'num_leaves': 31, + } + + if self.task_type == "cls": + params["objective"] = "binary" + params["metric"] = "auc" + elif self.task_type == "mcls": + params["objective"] = "multiclass" + params["num_class"] = df[self.label_col].nunique() + params["metric"] = "auc_mu" + elif self.task_type == "reg": + params["objective"] = "regression" + params["metric"] = "rmse" + + num_cols = df.select_dtypes(include=np.number).columns.tolist() + cols = [f for f in num_cols if f not in [self.label_col]] + + dtrain = lgb.Dataset(df[cols], df[self.label_col]) + model = lgb.train(params, dtrain, num_boost_round=100) + df_imp = pd.DataFrame({'feature_name': dtrain.feature_name, + 'importance': model.feature_importance("gain")}) + + df_imp.sort_values("importance", ascending=False, inplace=True) + df_imp = df_imp[df_imp["importance"] > 0] + self.feats = df_imp['feature_name'].tolist() + self.feats.append(self.label_col) + + def transform(self, df: pd.DataFrame) -> pd.DataFrame: + df = df[self.feats] + return df + + +class VarianceBasedSelection(MLProcess): + def __init__(self, label_col: str, threshold: float = 0): + self.label_col = label_col + self.threshold = threshold + self.feats = None + self.selector = VarianceThreshold(threshold=self.threshold) + + def fit(self, df: pd.DataFrame): + num_cols = df.select_dtypes(include=np.number).columns.tolist() + cols = [f for f in num_cols if f not in [self.label_col]] + + self.selector.fit(df[cols]) + self.feats = df[cols].columns[self.selector.get_support(indices=True)].tolist() + self.feats.append(self.label_col) + + def transform(self, df: pd.DataFrame) -> pd.DataFrame: + df = df[self.feats] + return df diff --git a/metagpt/tools/functions/schemas/data_preprocess.yml b/metagpt/tools/functions/schemas/data_preprocess.yml index 95b0124cc..4de697abd 100644 --- a/metagpt/tools/functions/schemas/data_preprocess.yml +++ b/metagpt/tools/functions/schemas/data_preprocess.yml @@ -11,7 +11,7 @@ FillMissingValue: description: "columns to be processed" strategy: type: str - description: "the imputation strategy" + description: "the imputation strategy, notice mean/median can only be used for numeric features" default: mean enum: - mean diff --git a/metagpt/tools/functions/schemas/feature_engineering.yml b/metagpt/tools/functions/schemas/feature_engineering.yml index 3ba9e863b..62e6ad5b3 100644 --- a/metagpt/tools/functions/schemas/feature_engineering.yml +++ b/metagpt/tools/functions/schemas/feature_engineering.yml @@ -1,6 +1,6 @@ PolynomialExpansion: type: class - description: "Add polynomial and interaction features from selected numeric columns, excluding the bias column." + description: "Add polynomial and interaction features from selected numeric columns to input DataFrame." methods: __init__: description: "Initialize self." @@ -9,12 +9,16 @@ PolynomialExpansion: cols: type: list description: "Columns for polynomial expansion." + label_col: + type: str + description: "Label column name." degree: type: int description: "The degree of the polynomial features." default: 2 required: - cols + - label_col fit: description: "Fit the PolynomialExpansion model." parameters: @@ -36,14 +40,14 @@ PolynomialExpansion: returns: df: type: DataFrame - description: "The transformed DataFrame." + description: "The transformed DataFrame without duplicated columns." fit_transform: description: "Fit and transform the input DataFrame." parameters: properties: df: type: DataFrame - description: "The input DataFrame." + description: "The input DataFrame without duplicated columns." required: - df returns: @@ -224,7 +228,7 @@ CatCross: properties: cols: type: list - description: "Columns to be pairwise crossed." + description: "Columns to be pairwise crossed, at least 2 columns." max_cat_num: type: int description: "Maximum unique categories per crossed feature." @@ -430,4 +434,115 @@ GeneralSelection: returns: df: type: DataFrame - description: "The transformed DataFrame." \ No newline at end of file + description: "The transformed DataFrame." + + +TreeBasedSelection: + type: class + description: "Select features based on tree-based model and remove features with low importance." + methods: + __init__: + description: "Initialize self." + parameters: + properties: + label_col: + type: str + description: "Label column name." + task_type: + type: str + description: "Task type, 'cls' for classification, 'mcls' for multi-class classification, 'reg' for regression." + enum: + - cls + - mcls + - reg + required: + - label_col + - task_type + fit: + description: "Fit the TreeBasedSelection model." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + transform: + description: "Transform the input DataFrame with the fitted model." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + returns: + df: + type: DataFrame + description: "The transformed DataFrame contain label_col." + fit_transform: + description: "Fit and transform the input DataFrame." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + returns: + df: + type: DataFrame + description: "The transformed DataFrame contain label_col." + +VarianceBasedSelection: + type: class + description: "Select features based on variance and remove features with low variance." + methods: + __init__: + description: "Initialize self." + parameters: + properties: + label_col: + type: str + description: "Label column name." + threshold: + type: float + description: "Threshold for variance." + default: 0.0 + required: + - label_col + fit: + description: "Fit the VarianceBasedSelection model." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + transform: + description: "Transform the input DataFrame with the fitted model." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + returns: + df: + type: DataFrame + description: "The transformed DataFrame contain label_col." + fit_transform: + description: "Fit and transform the input DataFrame." + parameters: + properties: + df: + type: DataFrame + description: "The input DataFrame." + required: + - df + returns: + df: + type: DataFrame + description: "The transformed DataFrame contain label_col." \ No newline at end of file