# Reconstructed from a whitespace-mangled `git diff` fragment; the original
# hunk added metagpt/tools/functions/libs/data_preprocess.py as a new file.
"""Data-preprocessing tool functions registered with the tool registry.

Every function takes a DataFrame plus the list of columns to transform,
rewrites those columns in place (the input frame IS mutated), and returns
the same frame.
"""
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OrdinalEncoder

from metagpt.tools.functions import registry
from metagpt.tools.functions.schemas.data_preprocess import *


@registry.register("data_preprocess", FillMissingValue)
def fill_missing_value(df: pd.DataFrame, features: list, strategy: str = 'mean', fill_value=None):
    """Impute missing values in *features* with sklearn's SimpleImputer.

    strategy: 'mean', 'median', 'most_frequent' or 'constant';
    fill_value is only consulted when strategy == 'constant'.
    """
    df[features] = SimpleImputer(strategy=strategy, fill_value=fill_value).fit_transform(df[features])
    return df


@registry.register("data_preprocess", SplitBins)
def split_bins(df: pd.DataFrame, features: list, strategy: str = 'quantile'):
    """Discretize *features* into bins, ordinal-encoded as integers.

    Uses KBinsDiscretizer's default bin count; strategy controls bin widths
    ('uniform', 'quantile' or 'kmeans').
    """
    df[features] = KBinsDiscretizer(strategy=strategy, encode='ordinal').fit_transform(df[features])
    return df


@registry.register("data_preprocess", MinMaxScale)
def min_max_scale(df: pd.DataFrame, features: list):
    """Scale each of *features* to the (0, 1) range."""
    df[features] = MinMaxScaler().fit_transform(df[features])
    return df


@registry.register("data_preprocess", StandardScale)
def standard_scale(df: pd.DataFrame, features: list):
    """Standardize *features*: remove the mean, scale to unit variance."""
    df[features] = StandardScaler().fit_transform(df[features])
    return df


@registry.register("data_preprocess", LogTransform)
def log_transform(df: pd.DataFrame, features: list):
    """Apply a natural-log transform to each column in *features*.

    Columns containing non-positive values are first shifted so their
    minimum becomes 2, guaranteeing log() gets strictly positive input.
    NOTE(review): a shift to 1 (log1p-style) is more conventional — confirm
    the +2 offset is intentional before changing it.
    """
    for col in features:
        if df[col].min() <= 0:
            df[col] = df[col] - df[col].min() + 2
        df[col] = np.log(df[col])
    return df


@registry.register("data_preprocess", MaxAbsScale)
def max_abs_scale(df: pd.DataFrame, features: list):
    """Scale each of *features* by its maximum absolute value."""
    df[features] = MaxAbsScaler().fit_transform(df[features])
    return df


@registry.register("data_preprocess", RobustScale)
def robust_scale(df: pd.DataFrame, features: list):
    """Scale *features* with outlier-robust statistics (median / IQR)."""
    df[features] = RobustScaler().fit_transform(df[features])
    return df


@registry.register("data_preprocess", OrdinalEncode)
def ordinal_encode(df: pd.DataFrame, features: list):
    """Encode categorical *features* as integer codes."""
    df[features] = OrdinalEncoder().fit_transform(df[features])
    return df


if __name__ == '__main__':
    def run():
        """Smoke-test every preprocessing function on a tiny mixed frame."""
        # np.NAN was removed in NumPy 2.0 — use the canonical np.nan alias.
        V = {
            'a': [-1, 2, 3, 6, 5, 4],
            'b': [1.1, 2.2, 3.3, 6.6, 5.5, 4.4],
            'c': ['aa', 'bb', 'cc', 'dd', 'ee', 'ff'],
            'd': [1, None, 3, None, 5, 4],
            'e': [1.1, np.nan, 3.3, None, 5.5, 4.4],
            'f': ['aa', np.nan, 'cc', None, '', 'ff'],
        }

        df = pd.DataFrame(V)
        print(df.dtypes)

        numeric_features = ['a', 'b', 'd', 'e']
        numeric_features_wo_miss = ['a', 'b']  # numeric columns without NaNs
        categorial_features = ['c', 'f']

        df_ = fill_missing_value(df.copy(), numeric_features)
        print(df_)
        df_ = fill_missing_value(df.copy(), categorial_features, strategy='constant', fill_value='hehe')
        print(df_)

        df_ = fill_missing_value(df.copy(), numeric_features, strategy='constant', fill_value=999)
        print(df_)

        df_ = split_bins(df.copy(), numeric_features_wo_miss, strategy='quantile')
        print(df_)

        df_ = min_max_scale(df.copy(), numeric_features)
        print(df_)

        df_ = standard_scale(df.copy(), numeric_features)
        print(df_)

        df_ = log_transform(df.copy(), numeric_features)
        print(df_)

        df_ = max_abs_scale(df.copy(), numeric_features)
        print(df_)

        df_ = robust_scale(df.copy(), numeric_features)
        print(df_)
    run()
# Reconstructed from a whitespace-mangled `git diff` fragment; the original
# hunk added metagpt/tools/functions/libs/ml_model.py as a new file.
"""Baseline classification / regression tool functions.

Each function label-encodes object columns, zero-fills NaNs, performs a
fixed train/test split (random_state=1), fits one sklearn model and
returns the test-set predictions under the key 'te_pred_prob'.
"""
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from metagpt.tools.functions import registry
from metagpt.tools.functions.schemas.ml_model import *


def _prepare_split(df, label, test_size):
    """Shared preprocessing for every model function.

    Label-encodes object columns (mutating *df* in place, as the original
    copies of this code did), fills NaNs with 0, separates *label* from
    the features and returns (tr_x, te_x, tr_y, te_y).
    """
    for col in [c for c in df if df[c].dtype == 'object']:
        df[col] = LabelEncoder().fit_transform(df[col])
    df = df.fillna(0)
    features = [c for c in df if c != label]
    return train_test_split(df[features], df[label], test_size=test_size, random_state=1)


##################
# Classification #
##################


@registry.register("classification_model", LogisticRegressionClassification)
def logistic_regression_classification(df, label, test_size=0.2, penalty='l2', dual=False):
    """Fit LogisticRegression; return {'te_pred_prob': test-set class probabilities}."""
    tr_x, te_x, tr_y, te_y = _prepare_split(df, label, test_size)
    model = LogisticRegression(penalty=penalty, dual=dual)
    model.fit(tr_x, tr_y)
    return {'te_pred_prob': model.predict_proba(te_x)}


@registry.register("classification_model", RandomForestClassification)
def random_forest_classification(df, label, test_size=0.2, n_estimators=100, criterion='gini'):
    """Fit RandomForestClassifier; return {'te_pred_prob': test-set class probabilities}."""
    tr_x, te_x, tr_y, te_y = _prepare_split(df, label, test_size)
    model = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion)
    model.fit(tr_x, tr_y)
    return {'te_pred_prob': model.predict_proba(te_x)}


@registry.register("classification_model", GradientBoostingClassification)
def gradient_boosting_classification(df, label, test_size=0.2, n_estimators=100, learning_rate=0.1):
    """Fit GradientBoostingClassifier; return {'te_pred_prob': test-set class probabilities}."""
    tr_x, te_x, tr_y, te_y = _prepare_split(df, label, test_size)
    model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate)
    model.fit(tr_x, tr_y)
    return {'te_pred_prob': model.predict_proba(te_x)}


##############
# Regression #
##############


@registry.register("regression_model", LinearRegressionRegression)
def linear_regression(df, label, test_size=0.2):
    """Fit LinearRegression; return {'te_pred_prob': test-set predictions}.

    NOTE(review): the key name 'te_pred_prob' is kept for caller
    compatibility, but for regressors the values are point predictions,
    not probabilities.
    """
    tr_x, te_x, tr_y, te_y = _prepare_split(df, label, test_size)
    model = LinearRegression()
    model.fit(tr_x, tr_y)
    return {'te_pred_prob': model.predict(te_x)}


@registry.register("regression_model", RandomForestRegression)
def random_forest_regression(df, label, test_size=0.2, n_estimators=100, criterion='squared_error'):
    """Fit RandomForestRegressor; return {'te_pred_prob': test-set predictions}."""
    tr_x, te_x, tr_y, te_y = _prepare_split(df, label, test_size)
    model = RandomForestRegressor(n_estimators=n_estimators, criterion=criterion)
    model.fit(tr_x, tr_y)
    return {'te_pred_prob': model.predict(te_x)}


@registry.register("regression_model", GradientBoostingRegression)
def gradient_boosting_regression(df, label, test_size=0.2, n_estimators=100, learning_rate=0.1):
    """Fit GradientBoostingRegressor; return {'te_pred_prob': test-set predictions}."""
    tr_x, te_x, tr_y, te_y = _prepare_split(df, label, test_size)
    model = GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=learning_rate)
    model.fit(tr_x, tr_y)
    return {'te_pred_prob': model.predict(te_x)}


if __name__ == '__main__':
    def run():
        """Smoke-test every model function on sklearn toy datasets."""
        from sklearn.datasets import load_iris
        loader = load_iris(as_frame=True)
        df = loader['data']
        df['target'] = loader['target']

        # Force mixed dtypes to exercise the label-encoding / casting path.
        df[df.columns[0]] = df[df.columns[0]].astype(str)
        df[df.columns[1]] = df[df.columns[1]].astype(int)
        df['target'] = df['target'].astype(str)

        print(df)
        print('####'*5)
        res = logistic_regression_classification(df, 'target', test_size=0.25, penalty='l2', dual=False)
        print(res['te_pred_prob'])

        print('####'*5)
        res = random_forest_classification(df, 'target', test_size=0.25, n_estimators=100, criterion='gini')
        print(res['te_pred_prob'])

        print('####'*5)
        res = gradient_boosting_classification(df, 'target', test_size=0.25, n_estimators=100, learning_rate=0.1)
        print(res['te_pred_prob'])

        from sklearn.datasets import make_regression
        import pandas as pd
        loader = make_regression()
        df = pd.DataFrame(loader[0])
        df['target'] = loader[1]

        df[df.columns[0]] = df[df.columns[0]].astype(str)
        df[df.columns[1]] = df[df.columns[1]].astype(int)

        print(df)
        print('####' * 5)
        res = linear_regression(df, 'target', test_size=0.25)
        print(res['te_pred_prob'])

        print('####' * 5)
        res = random_forest_regression(df, 'target', test_size=0.25, n_estimators=100, criterion='squared_error')
        print(res['te_pred_prob'])

        print('####' * 5)
        res = gradient_boosting_regression(df, 'target', test_size=0.25, n_estimators=100, learning_rate=0.1)
        print(res['te_pred_prob'])
    run()


# --- Reconstructed: metagpt/tools/functions/schemas/data_preprocess.py (new file) ---
"""Tool schemas for the data-preprocessing functions."""
import pandas as pd

from metagpt.tools.functions.schemas.base import tool_field, ToolSchema


class FillMissingValue(ToolSchema):
    """Completing missing values with simple strategies"""
    df: pd.DataFrame = tool_field(description="input dataframe")
    features: list = tool_field(description="columns to be processed")
    strategy: str = tool_field(description="the imputation strategy", default='mean')
    # NOTE(review): the demo also passes a string fill_value ('hehe'), so the
    # `int` annotation looks too narrow — confirm against the schema generator.
    fill_value: int = tool_field(description="fill_value is used to replace all occurrences of missing_values", default=None)


class SplitBins(ToolSchema):
    """Bin continuous data into intervals and return the bin identifier encoded as an integer value"""
    df: pd.DataFrame = tool_field(description="input dataframe")
    features: list = tool_field(description="columns to be processed")
    strategy: str = tool_field(description="Strategy used to define the widths of the bins", default='quantile')
# --- Reconstructed continuation of metagpt/tools/functions/schemas/data_preprocess.py ---


class MinMaxScale(ToolSchema):
    """Transform features by scaling each feature to a range, which is (0, 1)"""
    df: pd.DataFrame = tool_field(description="input dataframe")
    features: list = tool_field(description="columns to be processed")


class StandardScale(ToolSchema):
    """Standardize features by removing the mean and scaling to unit variance"""
    df: pd.DataFrame = tool_field(description="input dataframe")
    features: list = tool_field(description="columns to be processed")


class LogTransform(ToolSchema):
    """Performs a logarithmic transformation on the specified columns"""
    df: pd.DataFrame = tool_field(description="input dataframe")
    features: list = tool_field(description="columns to be processed")


class MaxAbsScale(ToolSchema):
    """Scale each feature by its maximum absolute value"""
    df: pd.DataFrame = tool_field(description="input dataframe")
    features: list = tool_field(description="columns to be processed")


class RobustScale(ToolSchema):
    """Scale features using statistics that are robust to outliers, the quantile_range is (25.0, 75.0)"""
    df: pd.DataFrame = tool_field(description="input dataframe")
    features: list = tool_field(description="columns to be processed")


class OrdinalEncode(ToolSchema):
    """Encode categorical features as an integer array"""
    df: pd.DataFrame = tool_field(description="input dataframe")
    features: list = tool_field(description="columns to be processed")


# --- Reconstructed: metagpt/tools/functions/schemas/ml_model.py (new file) ---
"""Tool schemas for the classification / regression model functions."""
import pandas as pd

from metagpt.tools.functions.schemas.base import tool_field, ToolSchema


class LogisticRegressionClassification(ToolSchema):
    """Logistic Regression (aka logit, MaxEnt) classifier"""
    df: pd.DataFrame = tool_field(description="input dataframe")
    label: str = tool_field(description="target name")
    test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)
    penalty: str = tool_field(description="Specify the norm of the penalty", default="l2")
    # BUG FIX: default was "l2" (copy-pasted from `penalty`); `dual` is a bool
    # and the implementing function defaults it to False.
    dual: bool = tool_field(description="Dual (constrained) or primal (regularized) formulation", default=False)


class RandomForestClassification(ToolSchema):
    """random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting"""
    df: pd.DataFrame = tool_field(description="input dataframe")
    label: str = tool_field(description="target name")
    test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)
    n_estimators: int = tool_field(description="The number of trees in the forest", default=100)
    criterion: str = tool_field(description="The function to measure the quality of a split", default="gini")


class GradientBoostingClassification(ToolSchema):
    """Gradient Boosting for classification.This algorithm builds an additive model in a forward stage-wise fashion"""
    df: pd.DataFrame = tool_field(description="input dataframe")
    label: str = tool_field(description="target name")
    test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)
    n_estimators: int = tool_field(description="The number of boosting stages to perform", default=100)
    learning_rate: float = tool_field(description="Learning rate shrinks the contribution of each tree by learning_rate", default=0.1)


class LinearRegressionRegression(ToolSchema):
    """Ordinary least squares Linear Regression."""
    df: pd.DataFrame = tool_field(description="input dataframe")
    label: str = tool_field(description="target name")
    test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)


class RandomForestRegression(ToolSchema):
    """random forest is a meta estimator that fits a number of decision tree on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting"""
    df: pd.DataFrame = tool_field(description="input dataframe")
    label: str = tool_field(description="target name")
    test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)
    n_estimators: int = tool_field(description="The number of trees in the forest", default=100)
    criterion: str = tool_field(description="The function to measure the quality of a split", default="squared_error")


class GradientBoostingRegression(ToolSchema):
    """Gradient Boosting for regression.This estimator builds an additive model in a forward stage-wise fashion"""
    df: pd.DataFrame = tool_field(description="input dataframe")
    label: str = tool_field(description="target name")
    test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)
    n_estimators: int = tool_field(description="The number of boosting stages to perform", default=100)
    learning_rate: float = tool_field(description="Learning rate shrinks the contribution of each tree by learning_rate", default=0.1)