update ml functions

This commit is contained in:
lidanyang 2023-12-07 20:45:08 +08:00
parent 21d97a23bb
commit 7e343a100b
4 changed files with 80 additions and 48 deletions

View file

@ -1,15 +1,12 @@
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from metagpt.tools.functions import registry
from metagpt.tools.functions.schemas.data_preprocess import *
@ -21,13 +18,6 @@ def fill_missing_value(df: pd.DataFrame, features: list, strategy: str = 'mean',
return df
# @registry.register("data_preprocess", FillMissingValue)
# def label_encode(df: pd.DataFrame, features: list,):
# for col in features:
# df[col] = LabelEncoder().fit_transform(df[col])
# return df
@registry.register("data_preprocess", SplitBins)
def split_bins(df: pd.DataFrame, features: list, strategy: str = 'quantile',):
df[features] = KBinsDiscretizer(strategy=strategy, encode='ordinal').fit_transform(df[features])
@ -73,6 +63,17 @@ def ordinal_encode(df: pd.DataFrame, features: list,):
return df
@registry.register("data_preprocess", OneHotEncoding)
def one_hot_encoding(df, cols):
    """One-hot encode ``cols`` and replace them with indicator columns.

    Args:
        df: DataFrame to process. ``cols`` are dropped from this object in
            place, so the caller's frame loses those columns as well.
        cols: Categorical columns to be one-hot encoded.

    Returns:
        A new DataFrame holding the remaining columns of ``df`` followed by
        the indicator columns, which are named by
        ``OneHotEncoder.get_feature_names_out(cols)`` and share ``df``'s index.
    """
    try:
        # scikit-learn >= 1.2 names the dense-output switch ``sparse_output``;
        # the older ``sparse`` keyword was removed in 1.4.
        enc = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        # Releases before 1.2 only accept the original ``sparse`` keyword.
        enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
    ts_data = enc.fit_transform(df[cols])
    new_columns = enc.get_feature_names_out(cols)
    # Reuse df's index so the concat below aligns row by row.
    ts_data = pd.DataFrame(ts_data, columns=new_columns, index=df.index)
    df.drop(cols, axis=1, inplace=True)
    df = pd.concat([df, ts_data], axis=1)
    return df
if __name__ == '__main__':
def run():
V = {

View file

@ -8,7 +8,8 @@ import itertools
from dateutil.relativedelta import relativedelta
from pandas.api.types import is_numeric_dtype
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder
from sklearn.model_selection import KFold
from sklearn.preprocessing import PolynomialFeatures
from metagpt.tools.functions import registry
from metagpt.tools.functions.schemas.feature_engineering import *
@ -29,17 +30,6 @@ def polynomial_expansion(df, cols, degree=2):
return df
@registry.register("feature_engineering", OneHotEncoding)
def one_hot_encoding(df, cols):
    """One-hot encode ``cols`` and replace them with indicator columns.

    Args:
        df: DataFrame to process. ``cols`` are dropped from this object in
            place, so the caller's frame loses those columns as well.
        cols: Categorical columns to be one-hot encoded.

    Returns:
        A new DataFrame holding the remaining columns of ``df`` followed by
        the indicator columns, which are named by
        ``OneHotEncoder.get_feature_names_out(cols)`` and share ``df``'s index.
    """
    try:
        # scikit-learn >= 1.2 names the dense-output switch ``sparse_output``;
        # the older ``sparse`` keyword was removed in 1.4.
        enc = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        # Releases before 1.2 only accept the original ``sparse`` keyword.
        enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
    ts_data = enc.fit_transform(df[cols])
    new_columns = enc.get_feature_names_out(cols)
    # Reuse df's index so the concat below aligns row by row.
    ts_data = pd.DataFrame(ts_data, columns=new_columns, index=df.index)
    df.drop(cols, axis=1, inplace=True)
    df = pd.concat([df, ts_data], axis=1)
    return df
@registry.register("feature_engineering", FrequencyEncoding)
def frequency_encoding(df, cols):
for col in cols:
@ -48,6 +38,31 @@ def frequency_encoding(df, cols):
return df
@registry.register("feature_engineering", TargetMeanEncoder)
def target_mean_encoder(df, col, label):
    """Add ``<col>_target_mean``: the mean of ``label`` for each value of ``col``.

    Args:
        df: DataFrame to process; the new column is written onto it directly.
        col: Column to be mean encoded.
        label: Column whose per-category mean becomes the encoding.

    Returns:
        The same ``df`` object with the extra ``<col>_target_mean`` column.
    """
    category_means = df.groupby(col)[label].mean()
    encoded_name = f"{col}_target_mean"
    # Look each row's category up in the per-category mean table.
    df[encoded_name] = df[col].map(category_means.to_dict())
    return df
@registry.register("feature_engineering", KFoldTargetMeanEncoder)
def k_fold_target_mean_encoder(df, col, label, n_splits=5, random_state=2021):
    """Add ``<col>_kf_target_mean``, a k-fold target mean encoding of ``col``.

    Every row first receives the mean of ``label`` for its category computed
    on the other folds only; categories absent from those folds fall back to
    the global label mean. These out-of-fold values are then averaged per
    category, and that per-category average is what gets mapped onto ``df``.

    Args:
        df: DataFrame to process; the new column is written onto it directly.
        col: Column to be k-fold mean encoded.
        label: Label column the means are computed from.
        n_splits: Number of folds handed to ``KFold``.
        random_state: Seed for the shuffled ``KFold`` split.

    Returns:
        The same ``df`` object with the extra ``<col>_kf_target_mean`` column.
    """
    # Work on a copy with a fresh 0..n-1 index: the loop mixes positional
    # (iloc) and label-based (loc) access, which only agree when index labels
    # are unique. The final mapping is keyed on category values, so df's own
    # index is irrelevant to the result.
    tmp = df.reset_index(drop=True)
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    global_mean = tmp[label].mean()
    col_name = f"{col}_kf_target_mean"
    for trn_idx, val_idx in kf.split(tmp, tmp[label]):
        _trn, _val = tmp.iloc[trn_idx], tmp.iloc[val_idx]
        tmp.loc[tmp.index[val_idx], col_name] = _val[col].map(
            _trn.groupby(col)[label].mean()
        )
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on a column selection: that chained in-place form warns on recent
    # pandas and leaves ``tmp`` untouched under Copy-on-Write.
    tmp[col_name] = tmp[col_name].fillna(global_mean)
    encoder_dict = tmp.groupby(col)[col_name].mean().to_dict()
    df[col_name] = df[col].map(encoder_dict)
    return df
@registry.register("feature_engineering", CatCross)
def cat_cross(df, cols, max_cat_num=100):
for col in cols:
@ -56,7 +71,8 @@ def cat_cross(df, cols, max_cat_num=100):
for col1, col2 in itertools.combinations(cols, 2):
cross_col = f"{col1}_cross_{col2}"
df[cross_col] = df[col1].astype(str) + "_" + df[col2].astype(str)
crossed = df[col1].astype(str) + "_" + df[col2].astype(str)
df[cross_col] = crossed.astype('category').cat.codes
return df

View file

@ -8,14 +8,13 @@ class FillMissingValue(ToolSchema):
"""Completing missing values with simple strategies"""
df: pd.DataFrame = tool_field(description="input dataframe")
features: list = tool_field(description="columns to be processed")
strategy: str = tool_field(description="the imputation strategy", default='mean')
fill_value: int = tool_field(description="fill_value is used to replace all occurrences of missing_values", default=None)
# class LabelEncode(ToolSchema):
# """Completing missing values with simple strategies"""
# df: pd.DataFrame = tool_field(description="input dataframe")
# features: list = tool_field(description="columns to be processed")
strategy: str = tool_field(
description="the imputation strategy",
default='mean',
enum=['mean', 'median', 'most_frequent', 'constant']
)
fill_value: int = tool_field(
description="fill_value is used to replace all occurrences of missing_values", default=None)
class SplitBins(ToolSchema):
@ -60,3 +59,9 @@ class OrdinalEncode(ToolSchema):
df: pd.DataFrame = tool_field(description="input dataframe")
features: list = tool_field(description="columns to be processed")
class OneHotEncoding(ToolSchema):
    # Argument schema for a one-hot encoding tool with the two fields below.
    # NOTE(review): presumably paired with `one_hot_encoding(df, cols)` via
    # `registry.register("data_preprocess", OneHotEncoding)` — confirm.
    # NOTE(review): the docstring and `description` strings are presumably
    # consumed at runtime by ToolSchema/tool_field — treat them as behavior.
    """Apply one-hot encoding to specified categorical columns, the original columns will be dropped."""
    # Frame to encode.
    df: pd.DataFrame = tool_field(description="DataFrame to process.")
    # Names of the columns that get replaced by their indicator columns.
    cols: list = tool_field(description="Categorical columns to be one-hot encoded and dropped.")

View file

@ -12,29 +12,39 @@ from metagpt.tools.functions.schemas.base import ToolSchema, tool_field
class PolynomialExpansion(ToolSchema):
    # Argument schema for polynomial feature expansion.
    # NOTE(review): two consecutive string literals follow. Python keeps only
    # the first as the class docstring; the second is a bare expression with
    # no effect. They look like the before/after wording of one docstring —
    # confirm which one is meant to remain.
    """Generate polynomial and interaction features from selected columns, excluding the bias column."""
    """Add polynomial and interaction features from selected numeric columns, excluding the bias column."""
    # Frame to expand.
    df: pd.DataFrame = tool_field(description="DataFrame to process.")
    # Columns whose polynomial/interaction terms are generated.
    cols: list = tool_field(description="Columns for polynomial expansion.")
    # Default of 2 matches the `degree=2` default of `polynomial_expansion`.
    degree: int = tool_field(description="Degree of polynomial features.", default=2)
class OneHotEncoding(ToolSchema):
    # Argument schema for a one-hot encoding tool with the two fields below.
    # NOTE(review): presumably paired with `one_hot_encoding(df, cols)` via
    # `registry.register("feature_engineering", OneHotEncoding)` — confirm.
    # NOTE(review): the docstring and `description` strings are presumably
    # consumed at runtime by ToolSchema/tool_field — treat them as behavior.
    """Apply one-hot encoding to specified categorical columns, the original columns will be dropped."""
    # Frame to encode.
    df: pd.DataFrame = tool_field(description="DataFrame to process.")
    # Names of the columns to encode.
    cols: list = tool_field(description="Categorical columns to be one-hot encoded.")
class FrequencyEncoding(ToolSchema):
    # Argument schema for frequency encoding of categorical columns.
    # NOTE(review): two consecutive string literals follow. Python keeps only
    # the first as the class docstring; the second is a bare expression with
    # no effect. They look like the before/after wording of one docstring —
    # confirm which one is meant to remain.
    """Convert categorical columns to frequency encoding."""
    """Add value counts of categorical columns as new features."""
    # Frame to process.
    df: pd.DataFrame = tool_field(description="DataFrame to process.")
    # Field names match the `frequency_encoding(df, cols)` parameters.
    cols: list = tool_field(description="Categorical columns to be frequency encoded.")
class TargetMeanEncoder(ToolSchema):
    # Argument schema registered with `target_mean_encoder` under
    # "feature_engineering"; field names match that function's
    # (df, col, label) parameters.
    """Encodes a categorical column by the mean of the label column, and adds the result as a new feature."""
    # Frame to process.
    df: pd.DataFrame = tool_field(description="DataFrame to process.")
    # Single column name (not a list) whose categories are encoded.
    col: str = tool_field(description="Column to be mean encoded.")
    # Column whose per-category mean is used as the encoding.
    label: str = tool_field(description="Predicted label column.")
class KFoldTargetMeanEncoder(ToolSchema):
    # Argument schema registered with `k_fold_target_mean_encoder` under
    # "feature_engineering"; field names and defaults match that function's
    # signature (n_splits=5, random_state=2021).
    """Adds a new feature to the DataFrame by k-fold mean encoding of a categorical column using the label column."""
    # Frame to process.
    df: pd.DataFrame = tool_field(description="DataFrame to process.")
    # Single column name (not a list) whose categories are encoded.
    col: str = tool_field(description="Column to be k-fold mean encoded.")
    # Column the per-fold category means are computed from.
    label: str = tool_field(description="Predicted label column.")
    # Passed to KFold as `n_splits` by the registered function.
    n_splits: int = tool_field(description="Number of splits for K-fold.", default=5)
    # Passed to KFold as `random_state`; the function shuffles before splitting.
    random_state: int = tool_field(description="Random seed.", default=2021)
class CatCross(ToolSchema):
"""Create pairwise crossed features from categorical columns, joining values with '_'."""
"""Add pairwise crossed features and convert them to numerical features."""
df: pd.DataFrame = tool_field(description="DataFrame to process.")
cols: list = tool_field(description="Columns to be pairwise crossed.")
@ -44,7 +54,7 @@ class CatCross(ToolSchema):
class GroupStat(ToolSchema):
"""Perform aggregation operations on a specified column grouped by certain categories."""
"""Aggregate specified column in a DataFrame grouped by another column, adding new features named '<agg_col>_<agg_func>_by_<group_col>'."""
df: pd.DataFrame = tool_field(description="DataFrame to process.")
group_col: str = tool_field(description="Column used for grouping.")
@ -56,7 +66,7 @@ class GroupStat(ToolSchema):
class ExtractTimeComps(ToolSchema):
"""Extract specific time components from a designated time column in a DataFrame."""
"""Extract and add specific time components as new features from a designated time column."""
df: pd.DataFrame = tool_field(description="DataFrame to process.")
time_col: str = tool_field(
@ -69,7 +79,7 @@ class ExtractTimeComps(ToolSchema):
class FeShiftByTime(ToolSchema):
"""Shift column values in a DataFrame based on specified time intervals."""
"""Shift column values based on specified time intervals and add the resulting new features to the DataFrame. New features are named in the format of '<group_col>_<shift_col>_lag_<period>_<freq>'."""
df: pd.DataFrame = tool_field(description="DataFrame to process.")
time_col: str = tool_field(description="Column for time-based shifting.")