Merge dev to dev_tool_selection

This commit is contained in:
lidanyang 2023-12-06 17:08:09 +08:00
commit 56dd0ee882
8 changed files with 534 additions and 23 deletions

View file

@@ -0,0 +1,123 @@
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OrdinalEncoder
from metagpt.tools.functions import registry
from metagpt.tools.functions.schemas.data_preprocess import *
@registry.register("data_preprocess", FillMissingValue)
def fill_missing_value(df: pd.DataFrame, features: list, strategy: str = 'mean', fill_value=None,):
    """Impute missing values in the selected columns with sklearn's SimpleImputer."""
    imputer = SimpleImputer(strategy=strategy, fill_value=fill_value)
    df[features] = imputer.fit_transform(df[features])
    return df
# @registry.register("data_preprocess", FillMissingValue)
# def label_encode(df: pd.DataFrame, features: list,):
# for col in features:
# df[col] = LabelEncoder().fit_transform(df[col])
# return df
@registry.register("data_preprocess", SplitBins)
def split_bins(df: pd.DataFrame, features: list, strategy: str = 'quantile',):
    """Discretize the selected columns into bins, ordinal-encoded as integers."""
    discretizer = KBinsDiscretizer(strategy=strategy, encode='ordinal')
    df[features] = discretizer.fit_transform(df[features])
    return df
@registry.register("data_preprocess", MinMaxScale)
def min_max_scale(df: pd.DataFrame, features: list, ):
    """Rescale the selected columns to the (0, 1) range."""
    scaler = MinMaxScaler()
    df[features] = scaler.fit_transform(df[features])
    return df
@registry.register("data_preprocess", StandardScale)
def standard_scale(df: pd.DataFrame, features: list, ):
    """Standardize the selected columns to zero mean and unit variance."""
    scaler = StandardScaler()
    df[features] = scaler.fit_transform(df[features])
    return df
@registry.register("data_preprocess", LogTransform)
def log_transform(df: pd.DataFrame, features: list, ):
    """Apply a natural-log transform to each selected column.

    Columns containing non-positive values are shifted first so that
    their minimum becomes 2 before the log is taken.
    """
    for feature in features:
        col_min = df[feature].min()
        if col_min <= 0:
            # shift so every value is strictly positive (new minimum = 2)
            df[feature] = df[feature] - col_min + 2
        df[feature] = np.log(df[feature])
    return df
@registry.register("data_preprocess", MaxAbsScale)
def max_abs_scale(df: pd.DataFrame, features: list, ):
    """Scale each selected column by its maximum absolute value."""
    scaler = MaxAbsScaler()
    df[features] = scaler.fit_transform(df[features])
    return df
@registry.register("data_preprocess", RobustScale)
def robust_scale(df: pd.DataFrame, features: list, ):
    """Scale the selected columns using statistics that are robust to outliers."""
    scaler = RobustScaler()
    df[features] = scaler.fit_transform(df[features])
    return df
@registry.register("data_preprocess", OrdinalEncode)
def ordinal_encode(df: pd.DataFrame, features: list,):
    """Encode the selected categorical columns as integer codes."""
    encoder = OrdinalEncoder()
    df[features] = encoder.fit_transform(df[features])
    return df
if __name__ == '__main__':
    def run():
        """Smoke-test each preprocessing tool on a small mixed-type frame."""
        V = {
            'a': [-1, 2, 3, 6, 5, 4],
            'b': [1.1, 2.2, 3.3, 6.6, 5.5, 4.4],
            'c': ['aa', 'bb', 'cc', 'dd', 'ee', 'ff'],
            'd': [1, None, 3, None, 5, 4],
            # use np.nan: the np.NAN alias was removed in NumPy 2.0
            'e': [1.1, np.nan, 3.3, None, 5.5, 4.4],
            'f': ['aa', np.nan, 'cc', None, '', 'ff'],
        }
        df = pd.DataFrame(V)
        print(df.dtypes)
        numeric_features = ['a', 'b', 'd', 'e']
        # columns without missing values (KBinsDiscretizer cannot handle NaN)
        numeric_features_wo_miss = ['a', 'b', ]
        categorical_features = ['c', 'f']
        df_ = fill_missing_value(df.copy(), numeric_features)
        print(df_)
        df_ = fill_missing_value(df.copy(), categorical_features, strategy='constant', fill_value='hehe')
        print(df_)
        df_ = fill_missing_value(df.copy(), numeric_features, strategy='constant', fill_value=999)
        print(df_)
        df_ = split_bins(df.copy(), numeric_features_wo_miss, strategy='quantile')
        print(df_)
        df_ = min_max_scale(df.copy(), numeric_features, )
        print(df_)
        df_ = standard_scale(df.copy(), numeric_features, )
        print(df_)
        df_ = log_transform(df.copy(), numeric_features, )
        print(df_)
        df_ = max_abs_scale(df.copy(), numeric_features, )
        print(df_)
        df_ = robust_scale(df.copy(), numeric_features, )
        print(df_)
    run()

View file

@@ -0,0 +1,196 @@
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from metagpt.tools.functions import registry
from metagpt.tools.functions.schemas.ml_model import *
#########
## 分类 ##
#########
@registry.register("classification_model", LogisticRegressionClassification)
def logistic_regression_classification(df, label, test_size=0.2, penalty='l2', dual=False):
    """Train a LogisticRegression classifier and return test-set class probabilities.

    Args:
        df: input dataframe holding the features and the label column.
        label: name of the target column.
        test_size: fraction of rows held out for the test split.
        penalty: norm of the penalty passed to LogisticRegression.
        dual: dual (constrained) or primal (regularized) formulation.

    Returns:
        dict with key 'te_pred_prob': predicted class probabilities on the test split.
    """
    # Work on a copy: the in-place label encoding below would otherwise
    # mutate the caller's dataframe.
    df = df.copy()
    nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
    for col in nonnumeric_columns:
        df[col] = LabelEncoder().fit_transform(df[col])
    df = df.fillna(0)
    features = [col for col in df if col != label]
    x, y = df[features], df[label]
    tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1)
    model = LogisticRegression(penalty=penalty, dual=dual)
    model.fit(tr_x, tr_y, )
    te_pred_prob = model.predict_proba(te_x)
    res = {
        'te_pred_prob': te_pred_prob
    }
    return res
@registry.register("classification_model", RandomForestClassification)
def random_forest_classification(df, label, test_size=0.2, n_estimators=100, criterion='gini'):
    """Train a RandomForestClassifier and return test-set class probabilities.

    Returns:
        dict with key 'te_pred_prob': predicted class probabilities on the test split.
    """
    # Work on a copy so the in-place label encoding does not mutate the caller's dataframe.
    df = df.copy()
    nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
    for col in nonnumeric_columns:
        df[col] = LabelEncoder().fit_transform(df[col])
    df = df.fillna(0)
    features = [col for col in df if col != label]
    x, y = df[features], df[label]
    tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1)
    model = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion)
    model.fit(tr_x, tr_y, )
    te_pred_prob = model.predict_proba(te_x)
    res = {
        'te_pred_prob': te_pred_prob
    }
    return res
@registry.register("classification_model", GradientBoostingClassification)
def gradient_boosting_classification(df, label, test_size=0.2, n_estimators=100, learning_rate=0.1):
    """Train a GradientBoostingClassifier and return test-set class probabilities.

    Returns:
        dict with key 'te_pred_prob': predicted class probabilities on the test split.
    """
    # Work on a copy so the in-place label encoding does not mutate the caller's dataframe.
    df = df.copy()
    nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
    for col in nonnumeric_columns:
        df[col] = LabelEncoder().fit_transform(df[col])
    df = df.fillna(0)
    features = [col for col in df if col != label]
    x, y = df[features], df[label]
    tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1)
    model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate)
    model.fit(tr_x, tr_y, )
    te_pred_prob = model.predict_proba(te_x)
    res = {
        'te_pred_prob': te_pred_prob
    }
    return res
#########
## 回归 ##
#########
@registry.register("regression_model", LinearRegressionRegression)
def linear_regression(df, label, test_size=0.2, ):
    """Train an ordinary-least-squares LinearRegression and return test-set predictions.

    Returns:
        dict with key 'te_pred_prob': point predictions on the test split.
        (Key name kept for interface compatibility with the classification tools,
        even though these are predictions, not probabilities.)
    """
    # Work on a copy so the in-place label encoding does not mutate the caller's dataframe.
    df = df.copy()
    nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
    for col in nonnumeric_columns:
        df[col] = LabelEncoder().fit_transform(df[col])
    df = df.fillna(0)
    features = [col for col in df if col != label]
    x, y = df[features], df[label]
    tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1)
    model = LinearRegression()
    model.fit(tr_x, tr_y, )
    te_pred_prob = model.predict(te_x)
    res = {
        'te_pred_prob': te_pred_prob
    }
    return res
@registry.register("regression_model", RandomForestRegression)
def random_forest_regression(df, label, test_size=0.2, n_estimators=100, criterion='squared_error'):
    """Train a RandomForestRegressor and return test-set predictions.

    Returns:
        dict with key 'te_pred_prob': point predictions on the test split
        (key name kept for interface compatibility; these are not probabilities).
    """
    # Work on a copy so the in-place label encoding does not mutate the caller's dataframe.
    df = df.copy()
    nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
    for col in nonnumeric_columns:
        df[col] = LabelEncoder().fit_transform(df[col])
    df = df.fillna(0)
    features = [col for col in df if col != label]
    x, y = df[features], df[label]
    tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1)
    model = RandomForestRegressor(n_estimators=n_estimators, criterion=criterion)
    model.fit(tr_x, tr_y, )
    te_pred_prob = model.predict(te_x)
    res = {
        'te_pred_prob': te_pred_prob
    }
    return res
@registry.register("regression_model", GradientBoostingRegression)
def gradient_boosting_regression(df, label, test_size=0.2, n_estimators=100, learning_rate=0.1):
    """Train a GradientBoostingRegressor and return test-set predictions.

    Returns:
        dict with key 'te_pred_prob': point predictions on the test split
        (key name kept for interface compatibility; these are not probabilities).
    """
    # Work on a copy so the in-place label encoding does not mutate the caller's dataframe.
    df = df.copy()
    nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
    for col in nonnumeric_columns:
        df[col] = LabelEncoder().fit_transform(df[col])
    df = df.fillna(0)
    features = [col for col in df if col != label]
    x, y = df[features], df[label]
    tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1)
    model = GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=learning_rate)
    model.fit(tr_x, tr_y, )
    te_pred_prob = model.predict(te_x)
    res = {
        'te_pred_prob': te_pred_prob
    }
    return res
if __name__ == '__main__':
    def run():
        """Demo: fit every registered model on toy classification and regression data."""
        from sklearn.datasets import load_iris
        iris = load_iris(as_frame=True)
        df = iris['data']
        df['target'] = iris['target']
        # cast some columns to str / int to exercise the label-encoding path
        df[df.columns[0]] = df[df.columns[0]].astype(str)
        df[df.columns[1]] = df[df.columns[1]].astype(int)
        df['target'] = df['target'].astype(str)
        print(df)
        print('####' * 5)
        res = logistic_regression_classification(df, 'target', test_size=0.25, penalty='l2', dual=False)
        print(res['te_pred_prob'])
        print('####' * 5)
        res = random_forest_classification(df, 'target', test_size=0.25, n_estimators=100, criterion='gini')
        print(res['te_pred_prob'])
        print('####' * 5)
        res = gradient_boosting_classification(df, 'target', test_size=0.25, n_estimators=100, learning_rate=0.1)
        print(res['te_pred_prob'])

        from sklearn.datasets import make_regression
        import pandas as pd
        features, targets = make_regression()
        df = pd.DataFrame(features)
        df['target'] = targets
        df[df.columns[0]] = df[df.columns[0]].astype(str)
        df[df.columns[1]] = df[df.columns[1]].astype(int)
        print(df)
        print('####' * 5)
        res = linear_regression(df, 'target', test_size=0.25, )
        print(res['te_pred_prob'])
        print('####' * 5)
        res = random_forest_regression(df, 'target', test_size=0.25, n_estimators=100, criterion='squared_error')
        print(res['te_pred_prob'])
        print('####' * 5)
        res = gradient_boosting_regression(df, 'target', test_size=0.25, n_estimators=100, learning_rate=0.1)
        print(res['te_pred_prob'])
    run()

View file

@@ -0,0 +1,62 @@
import pandas as pd
from metagpt.tools.functions.schemas.base import tool_field, ToolSchema
class FillMissingValue(ToolSchema):
    """Completing missing values with simple strategies"""
    # Argument schema for the `fill_missing_value` data_preprocess tool.
    df: pd.DataFrame = tool_field(description="input dataframe")
    features: list = tool_field(description="columns to be processed")
    strategy: str = tool_field(description="the imputation strategy", default='mean')
    # NOTE(review): annotated int, but the demo passes a str fill value ('hehe') --
    # presumably any scalar is accepted; confirm how ToolSchema handles the type.
    fill_value: int = tool_field(description="fill_value is used to replace all occurrences of missing_values", default=None)
# class LabelEncode(ToolSchema):
# """Completing missing values with simple strategies"""
# df: pd.DataFrame = tool_field(description="input dataframe")
# features: list = tool_field(description="columns to be processed")
class SplitBins(ToolSchema):
    """Bin continuous data into intervals and return the bin identifier encoded as an integer value"""
    # Argument schema for the `split_bins` data_preprocess tool.
    df: pd.DataFrame = tool_field(description="input dataframe")
    features: list = tool_field(description="columns to be processed")
    strategy: str = tool_field(description="Strategy used to define the widths of the bins", default='quantile')
class MinMaxScale(ToolSchema):
    """Transform features by scaling each feature to a range, which is (0, 1)"""
    # Argument schema for the `min_max_scale` data_preprocess tool.
    df: pd.DataFrame = tool_field(description="input dataframe")
    features: list = tool_field(description="columns to be processed")
class StandardScale(ToolSchema):
    """Standardize features by removing the mean and scaling to unit variance"""
    # Argument schema for the `standard_scale` data_preprocess tool.
    df: pd.DataFrame = tool_field(description="input dataframe")
    features: list = tool_field(description="columns to be processed")
class LogTransform(ToolSchema):
    """Performs a logarithmic transformation on the specified columns"""
    # Argument schema for the `log_transform` data_preprocess tool.
    df: pd.DataFrame = tool_field(description="input dataframe")
    features: list = tool_field(description="columns to be processed")
class MaxAbsScale(ToolSchema):
    """Scale each feature by its maximum absolute value"""
    # Argument schema for the `max_abs_scale` data_preprocess tool.
    df: pd.DataFrame = tool_field(description="input dataframe")
    features: list = tool_field(description="columns to be processed")
class RobustScale(ToolSchema):
    """Scale features using statistics that are robust to outliers, the quantile_range is (25.0, 75.0)"""
    # Argument schema for the `robust_scale` data_preprocess tool.
    df: pd.DataFrame = tool_field(description="input dataframe")
    features: list = tool_field(description="columns to be processed")
class OrdinalEncode(ToolSchema):
    """Encode categorical features as an integer array"""
    # Argument schema for the `ordinal_encode` data_preprocess tool.
    df: pd.DataFrame = tool_field(description="input dataframe")
    features: list = tool_field(description="columns to be processed")

View file

@@ -0,0 +1,55 @@
import pandas as pd
from metagpt.tools.functions.schemas.base import tool_field, ToolSchema
class LogisticRegressionClassification(ToolSchema):
    """Logistic Regression (aka logit, MaxEnt) classifier"""
    # Argument schema for the `logistic_regression_classification` tool.
    df: pd.DataFrame = tool_field(description="input dataframe")
    label: str = tool_field(description="target name")
    test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)
    penalty: str = tool_field(description="Specify the norm of the penalty", default="l2")
    # Fixed copy-paste bug: default was the string "l2" on a bool field;
    # the implementing function declares dual=False.
    dual: bool = tool_field(description="Dual (constrained) or primal (regularized) formulation", default=False)
class RandomForestClassification(ToolSchema):
    """random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting"""
    # Argument schema for the `random_forest_classification` tool.
    df: pd.DataFrame = tool_field(description="input dataframe")
    label: str = tool_field(description="target name")
    test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)
    n_estimators: int = tool_field(description="The number of trees in the forest", default=100)
    criterion: str = tool_field(description="The function to measure the quality of a split", default="gini")
class GradientBoostingClassification(ToolSchema):
    """Gradient Boosting for classification.This algorithm builds an additive model in a forward stage-wise fashion"""
    # Argument schema for the `gradient_boosting_classification` tool.
    df: pd.DataFrame = tool_field(description="input dataframe")
    label: str = tool_field(description="target name")
    test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)
    n_estimators: int = tool_field(description="The number of boosting stages to perform", default=100)
    learning_rate: float = tool_field(description="Learning rate shrinks the contribution of each tree by learning_rate", default=0.1)
class LinearRegressionRegression(ToolSchema):
    """Ordinary least squares Linear Regression."""
    # Argument schema for the `linear_regression` tool.
    df: pd.DataFrame = tool_field(description="input dataframe")
    label: str = tool_field(description="target name")
    test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)
class RandomForestRegression(ToolSchema):
    """random forest is a meta estimator that fits a number of decision tree on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting"""
    # Argument schema for the `random_forest_regression` tool.
    df: pd.DataFrame = tool_field(description="input dataframe")
    label: str = tool_field(description="target name")
    test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)
    n_estimators: int = tool_field(description="The number of trees in the forest", default=100)
    criterion: str = tool_field(description="The function to measure the quality of a split", default="squared_error")
class GradientBoostingRegression(ToolSchema):
    """Gradient Boosting for regression.This estimator builds an additive model in a forward stage-wise fashion"""
    # Argument schema for the `gradient_boosting_regression` tool.
    df: pd.DataFrame = tool_field(description="input dataframe")
    label: str = tool_field(description="target name")
    test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)
    n_estimators: int = tool_field(description="The number of boosting stages to perform", default=100)
    learning_rate: float = tool_field(description="Learning rate shrinks the contribution of each tree by learning_rate", default=0.1)