update ml tools

2026-06-20 15:38:09 +02:00 · 2023-12-13 13:37:40 +08:00 · 2023-12-13 13:37:40 +08:00 · f614fbfa7c
commit f614fbfa7c
parent 49779d8615
2 changed files with 8 additions and 13 deletions
--- a/metagpt/tools/functions/libs/data_preprocess.py
+++ b/metagpt/tools/functions/libs/data_preprocess.py
@@ -1,6 +1,6 @@
 import numpy as np
 from sklearn.impute import SimpleImputer
-from sklearn.preprocessing import KBinsDiscretizer, LabelEncoder
+from sklearn.preprocessing import LabelEncoder
 from sklearn.preprocessing import MaxAbsScaler
 from sklearn.preprocessing import MinMaxScaler
 from sklearn.preprocessing import OneHotEncoder
@@ -8,7 +8,6 @@ from sklearn.preprocessing import OrdinalEncoder
 from sklearn.preprocessing import RobustScaler
 from sklearn.preprocessing import StandardScaler

-from metagpt.tools.functions import registry
 from metagpt.tools.functions.libs.base import MLProcess
 from metagpt.tools.functions.schemas.data_preprocess import *

@@ -57,15 +56,6 @@ class StandardScale(MLProcess):
        return df


-@registry.register("data_preprocess", LogTransform)
-def log_transform(df: pd.DataFrame, features: list, ):
-    for col in features:
-        if df[col].min() <= 0:
-            df[col] = df[col] - df[col].min() + 2
-        df[col] = np.log(df[col])
-    return df
-
-
 class MaxAbsScale(MLProcess):
    def __init__(self, features: list,):
        self.features = features
@@ -146,7 +136,7 @@ class LabelEncode(MLProcess):
        return df


-def get_column_info(df: pd.DataFrame) -> str:
+def get_column_info(df: pd.DataFrame) -> dict:
    data = []
    for i in df.columns:
        nan_freq = float("%.2g" % (df[i].isna().mean() * 100))
@@ -157,7 +147,7 @@ def get_column_info(df: pd.DataFrame) -> str:
        data,
        columns=["Column_name", "Data_type", "NaN_Frequency(%)", "N_unique"],
    )
-    return samples.to_string(index=False)
+    return samples.to_dict(orient='list')
 #
 #
 # if __name__ == '__main__':
--- a/metagpt/tools/functions/libs/feature_engineering.py
+++ b/metagpt/tools/functions/libs/feature_engineering.py
@@ -10,6 +10,7 @@ import numpy as np
 from dateutil.relativedelta import relativedelta
 from joblib import Parallel, delayed
 from pandas.api.types import is_numeric_dtype
+from pandas.core.dtypes.common import is_object_dtype
 from sklearn.model_selection import KFold
 from sklearn.preprocessing import PolynomialFeatures, KBinsDiscretizer

@@ -280,6 +281,10 @@ class GeneralSelection(MLProcess):
                or df.loc[df[col] == np.inf].shape[0] != 0
            ):
                feats.remove(col)
+
+            if is_object_dtype(df[col]) and df[col].nunique() == df.shape[0]:
+                feats.remove(col)
+
        self.feats = feats

    def transform(self, df: pd.DataFrame) -> pd.DataFrame: