update ml functions

2026-05-15 11:02:36 +02:00 · 2023-12-07 20:45:08 +08:00 · 2023-12-07 20:45:08 +08:00 · 7e343a100b
commit 7e343a100b
parent 21d97a23bb
4 changed files with 80 additions and 48 deletions
--- a/metagpt/tools/functions/libs/data_preprocess.py
+++ b/metagpt/tools/functions/libs/data_preprocess.py
@ -1,15 +1,12 @@
-
-import pandas as pd
 import numpy as np
-
 from sklearn.impute import SimpleImputer
-from sklearn.preprocessing import LabelEncoder
 from sklearn.preprocessing import KBinsDiscretizer
-from sklearn.preprocessing import MinMaxScaler
-from sklearn.preprocessing import StandardScaler
 from sklearn.preprocessing import MaxAbsScaler
-from sklearn.preprocessing import RobustScaler
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.preprocessing import OneHotEncoder
 from sklearn.preprocessing import OrdinalEncoder
+from sklearn.preprocessing import RobustScaler
+from sklearn.preprocessing import StandardScaler

 from metagpt.tools.functions import registry
 from metagpt.tools.functions.schemas.data_preprocess import *
@ -21,13 +18,6 @@ def fill_missing_value(df: pd.DataFrame, features: list, strategy: str = 'mean',
    return df


-# @registry.register("data_preprocess", FillMissingValue)
-# def label_encode(df: pd.DataFrame, features: list,):
-#     for col in features:
-#         df[col] = LabelEncoder().fit_transform(df[col])
-#     return df
-
-
@registry.register("data_preprocess", SplitBins)
 def split_bins(df: pd.DataFrame, features: list, strategy: str = 'quantile',):
    df[features] = KBinsDiscretizer(strategy=strategy, encode='ordinal').fit_transform(df[features])
@ -73,6 +63,17 @@ def ordinal_encode(df: pd.DataFrame, features: list,):
    return df


+@registry.register("data_preprocess", OneHotEncoding)
+def one_hot_encoding(df, cols):
+    """Replace the categorical columns ``cols`` with one-hot indicator columns.
+
+    Args:
+        df: DataFrame to process. It is left untouched; a new frame is returned.
+        cols: Names of the categorical columns to encode and drop.
+
+    Returns:
+        A DataFrame made of the remaining columns of ``df`` followed by the
+        indicator columns, which are named by the fitted encoder's
+        ``get_feature_names_out``. When ``cols`` is empty, ``df`` itself is
+        returned unchanged.
+    """
+    if len(cols) == 0:
+        # Nothing to encode, so skip fitting an encoder on a zero-column frame.
+        return df
+    try:
+        # scikit-learn >= 1.2 names the dense-output switch ``sparse_output``;
+        # the old ``sparse`` keyword was removed in 1.4.
+        enc = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
+    except TypeError:
+        # Older scikit-learn releases only accept ``sparse``.
+        enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
+    encoded = enc.fit_transform(df[cols])
+    new_columns = enc.get_feature_names_out(cols)
+    # Reuse df's index so the concat below aligns row by row.
+    encoded = pd.DataFrame(encoded, columns=new_columns, index=df.index)
+    # Drop without ``inplace=True`` so the caller's frame is not half-modified
+    # while a different object is returned.
+    df = pd.concat([df.drop(columns=cols), encoded], axis=1)
+    return df
+
+
 if __name__ == '__main__':
    def run():
        V = {
--- a/metagpt/tools/functions/libs/feature_engineering.py
+++ b/metagpt/tools/functions/libs/feature_engineering.py
@ -8,7 +8,8 @@ import itertools

 from dateutil.relativedelta import relativedelta
 from pandas.api.types import is_numeric_dtype
-from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder
+from sklearn.model_selection import KFold
+from sklearn.preprocessing import PolynomialFeatures

 from metagpt.tools.functions import registry
 from metagpt.tools.functions.schemas.feature_engineering import *
@ -29,17 +30,6 @@ def polynomial_expansion(df, cols, degree=2):
    return df


-@registry.register("feature_engineering", OneHotEncoding)
-def one_hot_encoding(df, cols):
-    enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
-    ts_data = enc.fit_transform(df[cols])
-    new_columns = enc.get_feature_names_out(cols)
-    ts_data = pd.DataFrame(ts_data, columns=new_columns, index=df.index)
-    df.drop(cols, axis=1, inplace=True)
-    df = pd.concat([df, ts_data], axis=1)
-    return df
-
-
@registry.register("feature_engineering", FrequencyEncoding)
 def frequency_encoding(df, cols):
    for col in cols:
@ -48,6 +38,31 @@ def frequency_encoding(df, cols):
    return df


+@registry.register("feature_engineering", TargetMeanEncoder)
+def target_mean_encoder(df, col, label):
+    """Add ``<col>_target_mean``: the mean of ``label`` within each ``col`` value.
+
+    The new column is written onto ``df`` itself, and ``df`` is returned.
+    """
+    grouped_label = df.groupby(col)[label]
+    category_to_mean = grouped_label.mean().to_dict()
+    new_col = f"{col}_target_mean"
+    df[new_col] = df[col].map(category_to_mean)
+    return df
+
+
+@registry.register("feature_engineering", KFoldTargetMeanEncoder)
+def k_fold_target_mean_encoder(df, col, label, n_splits=5, random_state=2021):
+    """Add ``<col>_kf_target_mean``, a k-fold target-mean encoding of ``col``.
+
+    For every fold, rows in the validation part are encoded with the per-category
+    mean of ``label`` computed on the training part only. Categories unseen in
+    the training part fall back to the overall mean of ``label``. The
+    out-of-fold values are then averaged per category and mapped onto ``df``.
+
+    Args:
+        df: DataFrame to process; the new column is written onto it.
+        col: Categorical column to encode.
+        label: Label column whose mean is used for the encoding.
+        n_splits: Number of folds passed to ``KFold``.
+        random_state: Seed for the shuffled ``KFold`` split.
+
+    Returns:
+        ``df`` with the added ``<col>_kf_target_mean`` column.
+    """
+    # Work on a copy so the intermediate out-of-fold column never leaks into df.
+    tmp = df.copy()
+    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
+
+    global_mean = tmp[label].mean()
+    col_name = f"{col}_kf_target_mean"
+    for trn_idx, val_idx in kf.split(tmp, tmp[label]):
+        _trn, _val = tmp.iloc[trn_idx], tmp.iloc[val_idx]
+        tmp.loc[tmp.index[val_idx], col_name] = _val[col].map(
+            _trn.groupby(col)[label].mean()
+        )
+    # Assign the filled column back. Calling ``fillna(..., inplace=True)`` on the
+    # selected column is a chained in-place update, which pandas warns about and
+    # which has no effect under Copy-on-Write.
+    tmp[col_name] = tmp[col_name].fillna(global_mean)
+    encoder_dict = tmp.groupby(col)[col_name].mean().to_dict()
+    df[col_name] = df[col].map(encoder_dict)
+    return df
+
+
@registry.register("feature_engineering", CatCross)
 def cat_cross(df, cols, max_cat_num=100):
    for col in cols:
@ -56,7 +71,8 @@ def cat_cross(df, cols, max_cat_num=100):

    for col1, col2 in itertools.combinations(cols, 2):
        cross_col = f"{col1}_cross_{col2}"
-        df[cross_col] = df[col1].astype(str) + "_" + df[col2].astype(str)
+        crossed = df[col1].astype(str) + "_" + df[col2].astype(str)
+        df[cross_col] = crossed.astype('category').cat.codes
    return df


--- a/metagpt/tools/functions/schemas/data_preprocess.py
+++ b/metagpt/tools/functions/schemas/data_preprocess.py
@ -8,14 +8,13 @@ class FillMissingValue(ToolSchema):
    """Completing missing values with simple strategies"""
    df: pd.DataFrame = tool_field(description="input dataframe")
    features: list = tool_field(description="columns to be processed")
-    strategy: str = tool_field(description="the imputation strategy", default='mean')
-    fill_value: int = tool_field(description="fill_value is used to replace all occurrences of missing_values", default=None)
-
-
-# class LabelEncode(ToolSchema):
-#     """Completing missing values with simple strategies"""
-#     df: pd.DataFrame = tool_field(description="input dataframe")
-#     features: list = tool_field(description="columns to be processed")
+    strategy: str = tool_field(
+        description="the imputation strategy",
+        default='mean',
+        enum=['mean', 'median', 'most_frequent', 'constant']
+    )
+    fill_value: int = tool_field(
+        description="fill_value is used to replace all occurrences of missing_values", default=None)


 class SplitBins(ToolSchema):
@ -60,3 +59,9 @@ class OrdinalEncode(ToolSchema):
    df: pd.DataFrame = tool_field(description="input dataframe")
    features: list = tool_field(description="columns to be processed")

+
+# Schema for the `one_hot_encoding` tool; its fields mirror that function's
+# (df, cols) parameters.
+# NOTE(review): the docstring and field descriptions are presumably surfaced
+# as the tool's description at runtime, so treat them as behavior, not comments.
+class OneHotEncoding(ToolSchema):
+    """Apply one-hot encoding to specified categorical columns, the original columns will be dropped."""
+
+    df: pd.DataFrame = tool_field(description="DataFrame to process.")
+    cols: list = tool_field(description="Categorical columns to be one-hot encoded and dropped.")
--- a/metagpt/tools/functions/schemas/feature_engineering.py
+++ b/metagpt/tools/functions/schemas/feature_engineering.py
@ -12,29 +12,39 @@ from metagpt.tools.functions.schemas.base import ToolSchema, tool_field


 class PolynomialExpansion(ToolSchema):
-    """Generate polynomial and interaction features from selected columns, excluding the bias column."""
+    """Add polynomial and interaction features from selected numeric columns, excluding the bias column."""

    df: pd.DataFrame = tool_field(description="DataFrame to process.")
    cols: list = tool_field(description="Columns for polynomial expansion.")
    degree: int = tool_field(description="Degree of polynomial features.", default=2)


-class OneHotEncoding(ToolSchema):
-    """Apply one-hot encoding to specified categorical columns, the original columns will be dropped."""
-
-    df: pd.DataFrame = tool_field(description="DataFrame to process.")
-    cols: list = tool_field(description="Categorical columns to be one-hot encoded.")
-
-
 class FrequencyEncoding(ToolSchema):
-    """Convert categorical columns to frequency encoding."""
+    """Add value counts of categorical columns as new features."""

    df: pd.DataFrame = tool_field(description="DataFrame to process.")
    cols: list = tool_field(description="Categorical columns to be frequency encoded.")


+# Schema for the `target_mean_encoder` tool; its fields mirror that function's
+# (df, col, label) parameters.
+# NOTE(review): the docstring and field descriptions are presumably surfaced
+# as the tool's description at runtime, so treat them as behavior, not comments.
+class TargetMeanEncoder(ToolSchema):
+    """Encodes a categorical column by the mean of the label column, and adds the result as a new feature."""
+
+    df: pd.DataFrame = tool_field(description="DataFrame to process.")
+    col: str = tool_field(description="Column to be mean encoded.")
+    label: str = tool_field(description="Predicted label column.")
+
+
+# Schema for the `k_fold_target_mean_encoder` tool; its fields mirror that
+# function's (df, col, label, n_splits, random_state) parameters, and the
+# defaults here (5 and 2021) match the function's own defaults.
+# NOTE(review): the docstring and field descriptions are presumably surfaced
+# as the tool's description at runtime, so treat them as behavior, not comments.
+class KFoldTargetMeanEncoder(ToolSchema):
+    """Adds a new feature to the DataFrame by k-fold mean encoding of a categorical column using the label column."""
+    df: pd.DataFrame = tool_field(description="DataFrame to process.")
+    col: str = tool_field(description="Column to be k-fold mean encoded.")
+    label: str = tool_field(description="Predicted label column.")
+    n_splits: int = tool_field(description="Number of splits for K-fold.", default=5)
+    random_state: int = tool_field(description="Random seed.", default=2021)
+
+
 class CatCross(ToolSchema):
-    """Create pairwise crossed features from categorical columns, joining values with '_'."""
+    """Add pairwise crossed features and convert them to numerical features."""

    df: pd.DataFrame = tool_field(description="DataFrame to process.")
    cols: list = tool_field(description="Columns to be pairwise crossed.")
@ -44,7 +54,7 @@ class CatCross(ToolSchema):


 class GroupStat(ToolSchema):
-    """Perform aggregation operations on a specified column grouped by certain categories."""
+    """Aggregate specified column in a DataFrame grouped by another column, adding new features named '<agg_col>_<agg_func>_by_<group_col>'."""

    df: pd.DataFrame = tool_field(description="DataFrame to process.")
    group_col: str = tool_field(description="Column used for grouping.")
@ -56,7 +66,7 @@ class GroupStat(ToolSchema):


 class ExtractTimeComps(ToolSchema):
-    """Extract specific time components from a designated time column in a DataFrame."""
+    """Extract and add specific time components as new features from a designated time column."""

    df: pd.DataFrame = tool_field(description="DataFrame to process.")
    time_col: str = tool_field(
@ -69,7 +79,7 @@ class ExtractTimeComps(ToolSchema):


 class FeShiftByTime(ToolSchema):
-    """Shift column values in a DataFrame based on specified time intervals."""
+    """Shift column values based on specified time intervals and add the resulting new features to the DataFrame. New features are named in the format of '<group_col>_<shift_col>_lag_<period>_<freq>'."""

    df: pd.DataFrame = tool_field(description="DataFrame to process.")
    time_col: str = tool_field(description="Column for time-based shifting.")