From f614fbfa7c3e22e968bc4229271df092c3be9575 Mon Sep 17 00:00:00 2001 From: lidanyang Date: Wed, 13 Dec 2023 13:37:40 +0800 Subject: [PATCH] update ml tools --- metagpt/tools/functions/libs/data_preprocess.py | 16 +++------------- .../tools/functions/libs/feature_engineering.py | 5 +++++ 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/metagpt/tools/functions/libs/data_preprocess.py b/metagpt/tools/functions/libs/data_preprocess.py index 39474b0fd..fa70bf8fc 100644 --- a/metagpt/tools/functions/libs/data_preprocess.py +++ b/metagpt/tools/functions/libs/data_preprocess.py @@ -1,6 +1,6 @@ import numpy as np from sklearn.impute import SimpleImputer -from sklearn.preprocessing import KBinsDiscretizer, LabelEncoder +from sklearn.preprocessing import LabelEncoder from sklearn.preprocessing import MaxAbsScaler from sklearn.preprocessing import MinMaxScaler from sklearn.preprocessing import OneHotEncoder @@ -8,7 +8,6 @@ from sklearn.preprocessing import OrdinalEncoder from sklearn.preprocessing import RobustScaler from sklearn.preprocessing import StandardScaler -from metagpt.tools.functions import registry from metagpt.tools.functions.libs.base import MLProcess from metagpt.tools.functions.schemas.data_preprocess import * @@ -57,15 +56,6 @@ class StandardScale(MLProcess): return df -@registry.register("data_preprocess", LogTransform) -def log_transform(df: pd.DataFrame, features: list, ): - for col in features: - if df[col].min() <= 0: - df[col] = df[col] - df[col].min() + 2 - df[col] = np.log(df[col]) - return df - - class MaxAbsScale(MLProcess): def __init__(self, features: list,): self.features = features @@ -146,7 +136,7 @@ class LabelEncode(MLProcess): return df -def get_column_info(df: pd.DataFrame) -> str: +def get_column_info(df: pd.DataFrame) -> dict: data = [] for i in df.columns: nan_freq = float("%.2g" % (df[i].isna().mean() * 100)) @@ -157,7 +147,7 @@ def get_column_info(df: pd.DataFrame) -> str: data, columns=["Column_name", "Data_type", "NaN_Frequency(%)", "N_unique"], ) - return samples.to_string(index=False) + return samples.to_dict(orient='list') # # # if __name__ == '__main__': diff --git a/metagpt/tools/functions/libs/feature_engineering.py b/metagpt/tools/functions/libs/feature_engineering.py index 67247d0d1..de54e4db0 100644 --- a/metagpt/tools/functions/libs/feature_engineering.py +++ b/metagpt/tools/functions/libs/feature_engineering.py @@ -10,6 +10,7 @@ import numpy as np from dateutil.relativedelta import relativedelta from joblib import Parallel, delayed from pandas.api.types import is_numeric_dtype +from pandas.core.dtypes.common import is_object_dtype from sklearn.model_selection import KFold from sklearn.preprocessing import PolynomialFeatures, KBinsDiscretizer @@ -280,6 +281,10 @@ class GeneralSelection(MLProcess): or df.loc[df[col] == np.inf].shape[0] != 0 ): feats.remove(col) + + if is_object_dtype(df[col]) and df[col].nunique() == df.shape[0]: + feats.remove(col) + self.feats = feats def transform(self, df: pd.DataFrame) -> pd.DataFrame: