update ml tools

This commit is contained in:
lidanyang 2023-12-13 13:37:40 +08:00
parent 49779d8615
commit f614fbfa7c
2 changed files with 8 additions and 13 deletions

View file

@ -1,6 +1,6 @@
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import KBinsDiscretizer, LabelEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
@ -8,7 +8,6 @@ from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from metagpt.tools.functions import registry
from metagpt.tools.functions.libs.base import MLProcess
from metagpt.tools.functions.schemas.data_preprocess import *
@ -57,15 +56,6 @@ class StandardScale(MLProcess):
return df
@registry.register("data_preprocess", LogTransform)
def log_transform(df: pd.DataFrame, features: list):
    """Apply a natural-log transform, in place, to selected columns of ``df``.

    A column whose minimum is zero or negative is first shifted so that its
    smallest value becomes 2, keeping every value inside the log's domain.

    Args:
        df: Frame holding the columns to transform; it is modified in place.
        features: Names of the columns to transform.

    Returns:
        The same ``df`` object, with each listed column replaced by its log.
    """
    for name in features:
        lowest = df[name].min()
        if lowest <= 0:
            # Shift so the column minimum lands on 2 before taking the log.
            df[name] = df[name] - lowest + 2
        df[name] = np.log(df[name])
    return df
class MaxAbsScale(MLProcess):
def __init__(self, features: list,):
self.features = features
@ -146,7 +136,7 @@ class LabelEncode(MLProcess):
return df
def get_column_info(df: pd.DataFrame) -> str:
def get_column_info(df: pd.DataFrame) -> dict:
data = []
for i in df.columns:
nan_freq = float("%.2g" % (df[i].isna().mean() * 100))
@ -157,7 +147,7 @@ def get_column_info(df: pd.DataFrame) -> str:
data,
columns=["Column_name", "Data_type", "NaN_Frequency(%)", "N_unique"],
)
return samples.to_string(index=False)
return samples.to_dict(orient='list')
#
#
# if __name__ == '__main__':

View file

@ -10,6 +10,7 @@ import numpy as np
from dateutil.relativedelta import relativedelta
from joblib import Parallel, delayed
from pandas.api.types import is_numeric_dtype
from pandas.core.dtypes.common import is_object_dtype
from sklearn.model_selection import KFold
from sklearn.preprocessing import PolynomialFeatures, KBinsDiscretizer
@ -280,6 +281,10 @@ class GeneralSelection(MLProcess):
or df.loc[df[col] == np.inf].shape[0] != 0
):
feats.remove(col)
if is_object_dtype(df[col]) and df[col].nunique() == df.shape[0]:
feats.remove(col)
self.feats = feats
def transform(self, df: pd.DataFrame) -> pd.DataFrame: