From 7e343a100b449a8441ab55063ad76661d0391f46 Mon Sep 17 00:00:00 2001 From: lidanyang Date: Thu, 7 Dec 2023 20:45:08 +0800 Subject: [PATCH] update ml functions --- .../tools/functions/libs/data_preprocess.py | 29 ++++++------- .../functions/libs/feature_engineering.py | 42 +++++++++++++------ .../functions/schemas/data_preprocess.py | 21 ++++++---- .../functions/schemas/feature_engineering.py | 36 ++++++++++------ 4 files changed, 80 insertions(+), 48 deletions(-) diff --git a/metagpt/tools/functions/libs/data_preprocess.py b/metagpt/tools/functions/libs/data_preprocess.py index 68c96bbc9..5579c5bd8 100644 --- a/metagpt/tools/functions/libs/data_preprocess.py +++ b/metagpt/tools/functions/libs/data_preprocess.py @@ -1,15 +1,12 @@ - -import pandas as pd import numpy as np - from sklearn.impute import SimpleImputer -from sklearn.preprocessing import LabelEncoder from sklearn.preprocessing import KBinsDiscretizer -from sklearn.preprocessing import MinMaxScaler -from sklearn.preprocessing import StandardScaler from sklearn.preprocessing import MaxAbsScaler -from sklearn.preprocessing import RobustScaler +from sklearn.preprocessing import MinMaxScaler +from sklearn.preprocessing import OneHotEncoder from sklearn.preprocessing import OrdinalEncoder +from sklearn.preprocessing import RobustScaler +from sklearn.preprocessing import StandardScaler from metagpt.tools.functions import registry from metagpt.tools.functions.schemas.data_preprocess import * @@ -21,13 +18,6 @@ def fill_missing_value(df: pd.DataFrame, features: list, strategy: str = 'mean', return df -# @registry.register("data_preprocess", FillMissingValue) -# def label_encode(df: pd.DataFrame, features: list,): -# for col in features: -# df[col] = LabelEncoder().fit_transform(df[col]) -# return df - - @registry.register("data_preprocess", SplitBins) def split_bins(df: pd.DataFrame, features: list, strategy: str = 'quantile',): df[features] = KBinsDiscretizer(strategy=strategy, encode='ordinal').fit_transform(df[features]) @@ -73,6 +63,17 @@ def ordinal_encode(df: pd.DataFrame, features: list,): return df +@registry.register("data_preprocess", OneHotEncoding) +def one_hot_encoding(df, cols): + enc = OneHotEncoder(handle_unknown="ignore", sparse=False) + ts_data = enc.fit_transform(df[cols]) + new_columns = enc.get_feature_names_out(cols) + ts_data = pd.DataFrame(ts_data, columns=new_columns, index=df.index) + df.drop(cols, axis=1, inplace=True) + df = pd.concat([df, ts_data], axis=1) + return df + + if __name__ == '__main__': def run(): V = { diff --git a/metagpt/tools/functions/libs/feature_engineering.py b/metagpt/tools/functions/libs/feature_engineering.py index 0573f362d..4780e4fa0 100644 --- a/metagpt/tools/functions/libs/feature_engineering.py +++ b/metagpt/tools/functions/libs/feature_engineering.py @@ -8,7 +8,8 @@ import itertools from dateutil.relativedelta import relativedelta from pandas.api.types import is_numeric_dtype -from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder +from sklearn.model_selection import KFold +from sklearn.preprocessing import PolynomialFeatures from metagpt.tools.functions import registry from metagpt.tools.functions.schemas.feature_engineering import * @@ -29,17 +30,6 @@ def polynomial_expansion(df, cols, degree=2): return df -@registry.register("feature_engineering", OneHotEncoding) -def one_hot_encoding(df, cols): - enc = OneHotEncoder(handle_unknown="ignore", sparse=False) - ts_data = enc.fit_transform(df[cols]) - new_columns = enc.get_feature_names_out(cols) - ts_data = pd.DataFrame(ts_data, columns=new_columns, index=df.index) - df.drop(cols, axis=1, inplace=True) - df = pd.concat([df, ts_data], axis=1) - return df - - @registry.register("feature_engineering", FrequencyEncoding) def frequency_encoding(df, cols): for col in cols: @@ -48,6 +38,31 @@ def frequency_encoding(df, cols): return df +@registry.register("feature_engineering", TargetMeanEncoder) +def target_mean_encoder(df, col, label): + encoder_dict = df.groupby(col)[label].mean().to_dict() + df[f"{col}_target_mean"] = df[col].map(encoder_dict) + return df + + +@registry.register("feature_engineering", KFoldTargetMeanEncoder) +def k_fold_target_mean_encoder(df, col, label, n_splits=5, random_state=2021): + tmp = df.copy() + kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state) + + global_mean = tmp[label].mean() + col_name = f"{col}_kf_target_mean" + for trn_idx, val_idx in kf.split(tmp, tmp[label]): + _trn, _val = tmp.iloc[trn_idx], tmp.iloc[val_idx] + tmp.loc[tmp.index[val_idx], col_name] = _val[col].map( + _trn.groupby(col)[label].mean() + ) + tmp[col_name].fillna(global_mean, inplace=True) + encoder_dict = tmp.groupby(col)[col_name].mean().to_dict() + df[f"{col}_kf_target_mean"] = df[col].map(encoder_dict) + return df + + @registry.register("feature_engineering", CatCross) def cat_cross(df, cols, max_cat_num=100): for col in cols: @@ -56,7 +71,8 @@ def cat_cross(df, cols, max_cat_num=100): for col1, col2 in itertools.combinations(cols, 2): cross_col = f"{col1}_cross_{col2}" - df[cross_col] = df[col1].astype(str) + "_" + df[col2].astype(str) + crossed = df[col1].astype(str) + "_" + df[col2].astype(str) + df[cross_col] = crossed.astype('category').cat.codes return df diff --git a/metagpt/tools/functions/schemas/data_preprocess.py b/metagpt/tools/functions/schemas/data_preprocess.py index 40e1d64e0..16b97aeac 100644 --- a/metagpt/tools/functions/schemas/data_preprocess.py +++ b/metagpt/tools/functions/schemas/data_preprocess.py @@ -8,14 +8,13 @@ class FillMissingValue(ToolSchema): """Completing missing values with simple strategies""" df: pd.DataFrame = tool_field(description="input dataframe") features: list = tool_field(description="columns to be processed") - strategy: str = tool_field(description="the imputation strategy", default='mean') - fill_value: int = tool_field(description="fill_value is used to replace all occurrences of missing_values", default=None) - - -# class LabelEncode(ToolSchema): -# """Completing missing values with simple strategies""" -# df: pd.DataFrame = tool_field(description="input dataframe") -# features: list = tool_field(description="columns to be processed") + strategy: str = tool_field( + description="the imputation strategy", + default='mean', + enum=['mean', 'median', 'most_frequent', 'constant'] + ) + fill_value: int = tool_field( + description="fill_value is used to replace all occurrences of missing_values", default=None) class SplitBins(ToolSchema): @@ -60,3 +59,9 @@ class OrdinalEncode(ToolSchema): df: pd.DataFrame = tool_field(description="input dataframe") features: list = tool_field(description="columns to be processed") + +class OneHotEncoding(ToolSchema): + """Apply one-hot encoding to specified categorical columns, the original columns will be dropped.""" + + df: pd.DataFrame = tool_field(description="DataFrame to process.") + cols: list = tool_field(description="Categorical columns to be one-hot encoded and dropped.") diff --git a/metagpt/tools/functions/schemas/feature_engineering.py b/metagpt/tools/functions/schemas/feature_engineering.py index df2eebff6..5c89d9b16 100644 --- a/metagpt/tools/functions/schemas/feature_engineering.py +++ b/metagpt/tools/functions/schemas/feature_engineering.py @@ -12,29 +12,39 @@ from metagpt.tools.functions.schemas.base import ToolSchema, tool_field class PolynomialExpansion(ToolSchema): - """Generate polynomial and interaction features from selected columns, excluding the bias column.""" + """Add polynomial and interaction features from selected numeric columns, excluding the bias column.""" df: pd.DataFrame = tool_field(description="DataFrame to process.") cols: list = tool_field(description="Columns for polynomial expansion.") degree: int = tool_field(description="Degree of polynomial features.", default=2) -class OneHotEncoding(ToolSchema): - """Apply one-hot encoding to specified categorical columns, the original columns will be dropped.""" - - df: pd.DataFrame = tool_field(description="DataFrame to process.") - cols: list = tool_field(description="Categorical columns to be one-hot encoded.") - - class FrequencyEncoding(ToolSchema): - """Convert categorical columns to frequency encoding.""" + """Add value counts of categorical columns as new features.""" df: pd.DataFrame = tool_field(description="DataFrame to process.") cols: list = tool_field(description="Categorical columns to be frequency encoded.") +class TargetMeanEncoder(ToolSchema): + """Encodes a categorical column by the mean of the label column, and adds the result as a new feature.""" + + df: pd.DataFrame = tool_field(description="DataFrame to process.") + col: str = tool_field(description="Column to be mean encoded.") + label: str = tool_field(description="Predicted label column.") + + +class KFoldTargetMeanEncoder(ToolSchema): + """Adds a new feature to the DataFrame by k-fold mean encoding of a categorical column using the label column.""" + df: pd.DataFrame = tool_field(description="DataFrame to process.") + col: str = tool_field(description="Column to be k-fold mean encoded.") + label: str = tool_field(description="Predicted label column.") + n_splits: int = tool_field(description="Number of splits for K-fold.", default=5) + random_state: int = tool_field(description="Random seed.", default=2021) + + class CatCross(ToolSchema): - """Create pairwise crossed features from categorical columns, joining values with '_'.""" + """Add pairwise crossed features and convert them to numerical features.""" df: pd.DataFrame = tool_field(description="DataFrame to process.") cols: list = tool_field(description="Columns to be pairwise crossed.") @@ -44,7 +54,7 @@ class CatCross(ToolSchema): class GroupStat(ToolSchema): - """Perform aggregation operations on a specified column grouped by certain categories.""" + """Aggregate specified column in a DataFrame grouped by another column, adding new features named '__by_'.""" df: pd.DataFrame = tool_field(description="DataFrame to process.") group_col: str = tool_field(description="Column used for grouping.") @@ -56,7 +66,7 @@ class GroupStat(ToolSchema): class ExtractTimeComps(ToolSchema): - """Extract specific time components from a designated time column in a DataFrame.""" + """Extract and add specific time components as new features from a designated time column.""" df: pd.DataFrame = tool_field(description="DataFrame to process.") time_col: str = tool_field( @@ -69,7 +79,7 @@ class ExtractTimeComps(ToolSchema): class FeShiftByTime(ToolSchema): - """Shift column values in a DataFrame based on specified time intervals.""" + """Shift column values based on specified time intervals and add the resulting new features to the DataFrame. New features are named in the format of '__lag__'.""" df: pd.DataFrame = tool_field(description="DataFrame to process.") time_col: str = tool_field(description="Column for time-based shifting.")