mirror of
https://github.com/FoundationAgents/MetaGPT.git
synced 2026-05-15 11:02:36 +02:00
update ml functions
This commit is contained in:
parent
21d97a23bb
commit
7e343a100b
4 changed files with 80 additions and 48 deletions
|
|
@ -1,15 +1,12 @@
|
|||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
from sklearn.preprocessing import KBinsDiscretizer
|
||||
from sklearn.preprocessing import MinMaxScaler
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.preprocessing import MaxAbsScaler
|
||||
from sklearn.preprocessing import RobustScaler
|
||||
from sklearn.preprocessing import MinMaxScaler
|
||||
from sklearn.preprocessing import OneHotEncoder
|
||||
from sklearn.preprocessing import OrdinalEncoder
|
||||
from sklearn.preprocessing import RobustScaler
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
|
||||
from metagpt.tools.functions import registry
|
||||
from metagpt.tools.functions.schemas.data_preprocess import *
|
||||
|
|
@ -21,13 +18,6 @@ def fill_missing_value(df: pd.DataFrame, features: list, strategy: str = 'mean',
|
|||
return df
|
||||
|
||||
|
||||
# @registry.register("data_preprocess", FillMissingValue)
|
||||
# def label_encode(df: pd.DataFrame, features: list,):
|
||||
# for col in features:
|
||||
# df[col] = LabelEncoder().fit_transform(df[col])
|
||||
# return df
|
||||
|
||||
|
||||
@registry.register("data_preprocess", SplitBins)
|
||||
def split_bins(df: pd.DataFrame, features: list, strategy: str = 'quantile',):
|
||||
df[features] = KBinsDiscretizer(strategy=strategy, encode='ordinal').fit_transform(df[features])
|
||||
|
|
@ -73,6 +63,17 @@ def ordinal_encode(df: pd.DataFrame, features: list,):
|
|||
return df
|
||||
|
||||
|
||||
@registry.register("data_preprocess", OneHotEncoding)
|
||||
def one_hot_encoding(df, cols):
    """One-hot encode the given categorical columns and drop the originals.

    Args:
        df (pd.DataFrame): DataFrame to process. Mutated in place (the
            columns in ``cols`` are dropped) before the result is returned.
        cols (list): Categorical columns to be one-hot encoded and dropped.

    Returns:
        pd.DataFrame: Frame with the encoded columns appended; unknown
        categories at transform time are ignored (all-zero row).
    """
    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed
    # in 1.4; try the new name first and fall back for older versions.
    try:
        enc = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
    encoded = enc.fit_transform(df[cols])
    new_columns = enc.get_feature_names_out(cols)
    # Keep the original index so concat aligns row-for-row.
    encoded = pd.DataFrame(encoded, columns=new_columns, index=df.index)
    df.drop(cols, axis=1, inplace=True)
    df = pd.concat([df, encoded], axis=1)
    return df
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
def run():
|
||||
V = {
|
||||
|
|
|
|||
|
|
@ -8,7 +8,8 @@ import itertools
|
|||
|
||||
from dateutil.relativedelta import relativedelta
|
||||
from pandas.api.types import is_numeric_dtype
|
||||
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder
|
||||
from sklearn.model_selection import KFold
|
||||
from sklearn.preprocessing import PolynomialFeatures
|
||||
|
||||
from metagpt.tools.functions import registry
|
||||
from metagpt.tools.functions.schemas.feature_engineering import *
|
||||
|
|
@ -29,17 +30,6 @@ def polynomial_expansion(df, cols, degree=2):
|
|||
return df
|
||||
|
||||
|
||||
@registry.register("feature_engineering", OneHotEncoding)
|
||||
def one_hot_encoding(df, cols):
    """One-hot encode the given categorical columns and drop the originals.

    Args:
        df (pd.DataFrame): DataFrame to process. Mutated in place (the
            columns in ``cols`` are dropped) before the result is returned.
        cols (list): Categorical columns to be one-hot encoded and dropped.

    Returns:
        pd.DataFrame: Frame with the encoded columns appended; unknown
        categories at transform time are ignored (all-zero row).
    """
    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed
    # in 1.4; try the new name first and fall back for older versions.
    try:
        enc = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
    encoded = enc.fit_transform(df[cols])
    new_columns = enc.get_feature_names_out(cols)
    # Keep the original index so concat aligns row-for-row.
    encoded = pd.DataFrame(encoded, columns=new_columns, index=df.index)
    df.drop(cols, axis=1, inplace=True)
    df = pd.concat([df, encoded], axis=1)
    return df
|
||||
|
||||
|
||||
@registry.register("feature_engineering", FrequencyEncoding)
|
||||
def frequency_encoding(df, cols):
|
||||
for col in cols:
|
||||
|
|
@ -48,6 +38,31 @@ def frequency_encoding(df, cols):
|
|||
return df
|
||||
|
||||
|
||||
@registry.register("feature_engineering", TargetMeanEncoder)
|
||||
def target_mean_encoder(df, col, label):
    """Mean-encode `col` by the average of `label` within each category.

    Adds the result as a new column named '<col>_target_mean' and returns
    the (mutated) DataFrame.
    """
    category_means = df.groupby(col)[label].mean()
    df[f"{col}_target_mean"] = df[col].map(category_means.to_dict())
    return df
|
||||
|
||||
|
||||
@registry.register("feature_engineering", KFoldTargetMeanEncoder)
|
||||
def k_fold_target_mean_encoder(df, col, label, n_splits=5, random_state=2021):
    """Add a k-fold (out-of-fold) target-mean-encoded feature for `col`.

    Each row's encoding is derived from label means computed on the other
    folds only, then averaged per category and mapped back, which limits
    target leakage compared to plain target-mean encoding.

    Args:
        df (pd.DataFrame): DataFrame to process; a new column named
            '<col>_kf_target_mean' is added in place.
        col (str): Column to be k-fold mean encoded.
        label (str): Predicted label column.
        n_splits (int): Number of K-fold splits. Defaults to 5.
        random_state (int): Random seed for shuffling. Defaults to 2021.

    Returns:
        pd.DataFrame: The input frame with the new feature column added.
    """
    tmp = df.copy()
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    global_mean = tmp[label].mean()
    col_name = f"{col}_kf_target_mean"
    for trn_idx, val_idx in kf.split(tmp, tmp[label]):
        _trn, _val = tmp.iloc[trn_idx], tmp.iloc[val_idx]
        # Encode the validation fold with means computed on the training
        # fold only, so a row never contributes to its own encoding.
        tmp.loc[tmp.index[val_idx], col_name] = _val[col].map(
            _trn.groupby(col)[label].mean()
        )
    # Categories unseen in a training fold fall back to the global mean.
    # NOTE: the original chained assignment
    # `tmp[col_name].fillna(global_mean, inplace=True)` is a silent no-op
    # under pandas copy-on-write (default in pandas 3.0); assign instead.
    tmp[col_name] = tmp[col_name].fillna(global_mean)
    encoder_dict = tmp.groupby(col)[col_name].mean().to_dict()
    df[col_name] = df[col].map(encoder_dict)
    return df
|
||||
|
||||
|
||||
@registry.register("feature_engineering", CatCross)
|
||||
def cat_cross(df, cols, max_cat_num=100):
|
||||
for col in cols:
|
||||
|
|
@ -56,7 +71,8 @@ def cat_cross(df, cols, max_cat_num=100):
|
|||
|
||||
for col1, col2 in itertools.combinations(cols, 2):
|
||||
cross_col = f"{col1}_cross_{col2}"
|
||||
df[cross_col] = df[col1].astype(str) + "_" + df[col2].astype(str)
|
||||
crossed = df[col1].astype(str) + "_" + df[col2].astype(str)
|
||||
df[cross_col] = crossed.astype('category').cat.codes
|
||||
return df
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -8,14 +8,13 @@ class FillMissingValue(ToolSchema):
|
|||
"""Completing missing values with simple strategies"""
|
||||
df: pd.DataFrame = tool_field(description="input dataframe")
|
||||
features: list = tool_field(description="columns to be processed")
|
||||
strategy: str = tool_field(description="the imputation strategy", default='mean')
|
||||
fill_value: int = tool_field(description="fill_value is used to replace all occurrences of missing_values", default=None)
|
||||
|
||||
|
||||
# class LabelEncode(ToolSchema):
|
||||
# """Completing missing values with simple strategies"""
|
||||
# df: pd.DataFrame = tool_field(description="input dataframe")
|
||||
# features: list = tool_field(description="columns to be processed")
|
||||
strategy: str = tool_field(
|
||||
description="the imputation strategy",
|
||||
default='mean',
|
||||
enum=['mean', 'median', 'most_frequent', 'constant']
|
||||
)
|
||||
fill_value: int = tool_field(
|
||||
description="fill_value is used to replace all occurrences of missing_values", default=None)
|
||||
|
||||
|
||||
class SplitBins(ToolSchema):
|
||||
|
|
@ -60,3 +59,9 @@ class OrdinalEncode(ToolSchema):
|
|||
df: pd.DataFrame = tool_field(description="input dataframe")
|
||||
features: list = tool_field(description="columns to be processed")
|
||||
|
||||
|
||||
class OneHotEncoding(ToolSchema):
    """Apply one-hot encoding to specified categorical columns, the original columns will be dropped."""

    # NOTE(review): the docstring and field descriptions are presumably
    # surfaced as the tool schema shown to the model — verify before editing.
    df: pd.DataFrame = tool_field(description="DataFrame to process.")
    cols: list = tool_field(description="Categorical columns to be one-hot encoded and dropped.")
|
||||
|
|
|
|||
|
|
@ -12,29 +12,39 @@ from metagpt.tools.functions.schemas.base import ToolSchema, tool_field
|
|||
|
||||
|
||||
class PolynomialExpansion(ToolSchema):
|
||||
"""Generate polynomial and interaction features from selected columns, excluding the bias column."""
|
||||
"""Add polynomial and interaction features from selected numeric columns, excluding the bias column."""
|
||||
|
||||
df: pd.DataFrame = tool_field(description="DataFrame to process.")
|
||||
cols: list = tool_field(description="Columns for polynomial expansion.")
|
||||
degree: int = tool_field(description="Degree of polynomial features.", default=2)
|
||||
|
||||
|
||||
class OneHotEncoding(ToolSchema):
|
||||
"""Apply one-hot encoding to specified categorical columns, the original columns will be dropped."""
|
||||
|
||||
df: pd.DataFrame = tool_field(description="DataFrame to process.")
|
||||
cols: list = tool_field(description="Categorical columns to be one-hot encoded.")
|
||||
|
||||
|
||||
class FrequencyEncoding(ToolSchema):
|
||||
"""Convert categorical columns to frequency encoding."""
|
||||
"""Add value counts of categorical columns as new features."""
|
||||
|
||||
df: pd.DataFrame = tool_field(description="DataFrame to process.")
|
||||
cols: list = tool_field(description="Categorical columns to be frequency encoded.")
|
||||
|
||||
|
||||
class TargetMeanEncoder(ToolSchema):
    """Encodes a categorical column by the mean of the label column, and adds the result as a new feature."""

    # New feature is named '<col>_target_mean' by the registered function.
    df: pd.DataFrame = tool_field(description="DataFrame to process.")
    col: str = tool_field(description="Column to be mean encoded.")
    label: str = tool_field(description="Predicted label column.")
|
||||
|
||||
|
||||
class KFoldTargetMeanEncoder(ToolSchema):
    """Adds a new feature to the DataFrame by k-fold mean encoding of a categorical column using the label column."""
    # Out-of-fold encoding: fold means are computed on the training folds
    # only, which limits target leakage versus plain target-mean encoding.
    df: pd.DataFrame = tool_field(description="DataFrame to process.")
    col: str = tool_field(description="Column to be k-fold mean encoded.")
    label: str = tool_field(description="Predicted label column.")
    n_splits: int = tool_field(description="Number of splits for K-fold.", default=5)
    random_state: int = tool_field(description="Random seed.", default=2021)
|
||||
|
||||
|
||||
class CatCross(ToolSchema):
|
||||
"""Create pairwise crossed features from categorical columns, joining values with '_'."""
|
||||
"""Add pairwise crossed features and convert them to numerical features."""
|
||||
|
||||
df: pd.DataFrame = tool_field(description="DataFrame to process.")
|
||||
cols: list = tool_field(description="Columns to be pairwise crossed.")
|
||||
|
|
@ -44,7 +54,7 @@ class CatCross(ToolSchema):
|
|||
|
||||
|
||||
class GroupStat(ToolSchema):
|
||||
"""Perform aggregation operations on a specified column grouped by certain categories."""
|
||||
"""Aggregate specified column in a DataFrame grouped by another column, adding new features named '<agg_col>_<agg_func>_by_<group_col>'."""
|
||||
|
||||
df: pd.DataFrame = tool_field(description="DataFrame to process.")
|
||||
group_col: str = tool_field(description="Column used for grouping.")
|
||||
|
|
@ -56,7 +66,7 @@ class GroupStat(ToolSchema):
|
|||
|
||||
|
||||
class ExtractTimeComps(ToolSchema):
|
||||
"""Extract specific time components from a designated time column in a DataFrame."""
|
||||
"""Extract and add specific time components as new features from a designated time column."""
|
||||
|
||||
df: pd.DataFrame = tool_field(description="DataFrame to process.")
|
||||
time_col: str = tool_field(
|
||||
|
|
@ -69,7 +79,7 @@ class ExtractTimeComps(ToolSchema):
|
|||
|
||||
|
||||
class FeShiftByTime(ToolSchema):
|
||||
"""Shift column values in a DataFrame based on specified time intervals."""
|
||||
"""Shift column values based on specified time intervals and add the resulting new features to the DataFrame. New features are named in the format of '<group_col>_<shift_col>_lag_<period>_<freq>'."""
|
||||
|
||||
df: pd.DataFrame = tool_field(description="DataFrame to process.")
|
||||
time_col: str = tool_field(description="Column for time-based shifting.")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue