From f614fbfa7c3e22e968bc4229271df092c3be9575 Mon Sep 17 00:00:00 2001
From: lidanyang <lidanyang@fuzhi.ai>
Date: Wed, 13 Dec 2023 13:37:40 +0800
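Subject: [PATCH] update ml tools

data_preprocess.py: remove the registry-registered log_transform function
and drop the KBinsDiscretizer and registry imports. get_column_info now
returns a dict (samples.to_dict(orient='list')) instead of the string
produced by samples.to_string(index=False).

feature_engineering.py: GeneralSelection additionally removes object-dtype
columns whose nunique() equals the number of rows in the DataFrame.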

---
 metagpt/tools/functions/libs/data_preprocess.py  | 16 +++-------------
 .../tools/functions/libs/feature_engineering.py  |  5 +++++
 2 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/metagpt/tools/functions/libs/data_preprocess.py b/metagpt/tools/functions/libs/data_preprocess.py
index 39474b0fd..fa70bf8fc 100644
--- a/metagpt/tools/functions/libs/data_preprocess.py
+++ b/metagpt/tools/functions/libs/data_preprocess.py
@@ -1,6 +1,6 @@
 import numpy as np
 from sklearn.impute import SimpleImputer
-from sklearn.preprocessing import KBinsDiscretizer, LabelEncoder
+from sklearn.preprocessing import LabelEncoder
 from sklearn.preprocessing import MaxAbsScaler
 from sklearn.preprocessing import MinMaxScaler
 from sklearn.preprocessing import OneHotEncoder
@@ -8,7 +8,6 @@ from sklearn.preprocessing import OrdinalEncoder
 from sklearn.preprocessing import RobustScaler
 from sklearn.preprocessing import StandardScaler
 
-from metagpt.tools.functions import registry
 from metagpt.tools.functions.libs.base import MLProcess
 from metagpt.tools.functions.schemas.data_preprocess import *
 
@@ -57,15 +56,6 @@ class StandardScale(MLProcess):
         return df
 
 
-@registry.register("data_preprocess", LogTransform)
-def log_transform(df: pd.DataFrame, features: list, ):
-    for col in features:
-        if df[col].min() <= 0:
-            df[col] = df[col] - df[col].min() + 2
-        df[col] = np.log(df[col])
-    return df
-
-
 class MaxAbsScale(MLProcess):
     def __init__(self, features: list,):
         self.features = features
@@ -146,7 +136,7 @@ class LabelEncode(MLProcess):
         return df
 
 
-def get_column_info(df: pd.DataFrame) -> str:
+def get_column_info(df: pd.DataFrame) -> dict:
     data = []
     for i in df.columns:
         nan_freq = float("%.2g" % (df[i].isna().mean() * 100))
@@ -157,7 +147,7 @@ def get_column_info(df: pd.DataFrame) -> str:
         data,
         columns=["Column_name", "Data_type", "NaN_Frequency(%)", "N_unique"],
     )
-    return samples.to_string(index=False)
+    return samples.to_dict(orient='list')
 #
 #
 # if __name__ == '__main__':
diff --git a/metagpt/tools/functions/libs/feature_engineering.py b/metagpt/tools/functions/libs/feature_engineering.py
index 67247d0d1..de54e4db0 100644
--- a/metagpt/tools/functions/libs/feature_engineering.py
+++ b/metagpt/tools/functions/libs/feature_engineering.py
@@ -10,6 +10,7 @@ import numpy as np
 from dateutil.relativedelta import relativedelta
 from joblib import Parallel, delayed
 from pandas.api.types import is_numeric_dtype
+from pandas.core.dtypes.common import is_object_dtype
 from sklearn.model_selection import KFold
 from sklearn.preprocessing import PolynomialFeatures, KBinsDiscretizer
 
@@ -280,6 +281,10 @@ class GeneralSelection(MLProcess):
                 or df.loc[df[col] == np.inf].shape[0] != 0
             ):
                 feats.remove(col)
+
+            if is_object_dtype(df[col]) and df[col].nunique() == df.shape[0]:
+                feats.remove(col)
+
         self.feats = feats
 
     def transform(self, df: pd.DataFrame) -> pd.DataFrame: