Merge dev to dev_tool_selection

This commit is contained in:
lidanyang 2023-12-06 17:08:09 +08:00
commit 56dd0ee882
8 changed files with 534 additions and 23 deletions

View file

@@ -0,0 +1,123 @@
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OrdinalEncoder
from metagpt.tools.functions import registry
from metagpt.tools.functions.schemas.data_preprocess import *
@registry.register("data_preprocess", FillMissingValue)
def fill_missing_value(df: pd.DataFrame, features: list, strategy: str = 'mean', fill_value=None,):
    """Impute missing values in the selected columns with sklearn's SimpleImputer."""
    imputer = SimpleImputer(strategy=strategy, fill_value=fill_value)
    df[features] = imputer.fit_transform(df[features])
    return df
# @registry.register("data_preprocess", FillMissingValue)
# def label_encode(df: pd.DataFrame, features: list,):
# for col in features:
# df[col] = LabelEncoder().fit_transform(df[col])
# return df
@registry.register("data_preprocess", SplitBins)
def split_bins(df: pd.DataFrame, features: list, strategy: str = 'quantile',):
    """Discretize the selected columns into bins, ordinal-encoded as integers."""
    discretizer = KBinsDiscretizer(strategy=strategy, encode='ordinal')
    df[features] = discretizer.fit_transform(df[features])
    return df
@registry.register("data_preprocess", MinMaxScale)
def min_max_scale(df: pd.DataFrame, features: list, ):
    """Rescale the selected columns to the (0, 1) range."""
    scaler = MinMaxScaler()
    df[features] = scaler.fit_transform(df[features])
    return df
@registry.register("data_preprocess", StandardScale)
def standard_scale(df: pd.DataFrame, features: list, ):
    """Standardize the selected columns to zero mean and unit variance."""
    scaler = StandardScaler()
    df[features] = scaler.fit_transform(df[features])
    return df
@registry.register("data_preprocess", LogTransform)
def log_transform(df: pd.DataFrame, features: list, ):
    """Apply a natural-log transform to each selected column.

    Columns containing non-positive values are shifted first so that
    their minimum becomes 2 before the log is taken.
    """
    for feature in features:
        col_min = df[feature].min()
        if col_min <= 0:
            # shift so every value is strictly positive (new minimum = 2)
            df[feature] = df[feature] - col_min + 2
        df[feature] = np.log(df[feature])
    return df
@registry.register("data_preprocess", MaxAbsScale)
def max_abs_scale(df: pd.DataFrame, features: list, ):
    """Scale each selected column by its maximum absolute value."""
    scaler = MaxAbsScaler()
    df[features] = scaler.fit_transform(df[features])
    return df
@registry.register("data_preprocess", RobustScale)
def robust_scale(df: pd.DataFrame, features: list, ):
    """Scale the selected columns using statistics that are robust to outliers."""
    scaler = RobustScaler()
    df[features] = scaler.fit_transform(df[features])
    return df
@registry.register("data_preprocess", OrdinalEncode)
def ordinal_encode(df: pd.DataFrame, features: list,):
    """Encode the selected categorical columns as integer codes."""
    encoder = OrdinalEncoder()
    df[features] = encoder.fit_transform(df[features])
    return df
if __name__ == '__main__':
    def run():
        """Smoke-test each preprocessing tool on a small mixed-type frame."""
        V = {
            'a': [-1, 2, 3, 6, 5, 4],
            'b': [1.1, 2.2, 3.3, 6.6, 5.5, 4.4],
            'c': ['aa', 'bb', 'cc', 'dd', 'ee', 'ff'],
            'd': [1, None, 3, None, 5, 4],
            # use np.nan: the np.NAN alias was removed in NumPy 2.0
            'e': [1.1, np.nan, 3.3, None, 5.5, 4.4],
            'f': ['aa', np.nan, 'cc', None, '', 'ff'],
        }
        df = pd.DataFrame(V)
        print(df.dtypes)
        numeric_features = ['a', 'b', 'd', 'e']
        # columns without missing values (KBinsDiscretizer cannot handle NaN)
        numeric_features_wo_miss = ['a', 'b', ]
        categorical_features = ['c', 'f']
        df_ = fill_missing_value(df.copy(), numeric_features)
        print(df_)
        df_ = fill_missing_value(df.copy(), categorical_features, strategy='constant', fill_value='hehe')
        print(df_)
        df_ = fill_missing_value(df.copy(), numeric_features, strategy='constant', fill_value=999)
        print(df_)
        df_ = split_bins(df.copy(), numeric_features_wo_miss, strategy='quantile')
        print(df_)
        df_ = min_max_scale(df.copy(), numeric_features, )
        print(df_)
        df_ = standard_scale(df.copy(), numeric_features, )
        print(df_)
        df_ = log_transform(df.copy(), numeric_features, )
        print(df_)
        df_ = max_abs_scale(df.copy(), numeric_features, )
        print(df_)
        df_ = robust_scale(df.copy(), numeric_features, )
        print(df_)
    run()

View file

@@ -0,0 +1,196 @@
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from metagpt.tools.functions import registry
from metagpt.tools.functions.schemas.ml_model import *
#########
## 分类 ##
#########
@registry.register("classification_model", LogisticRegressionClassification)
def logistic_regression_classification(df, label, test_size=0.2, penalty='l2', dual=False):
    """Train a LogisticRegression classifier and return test-set class probabilities.

    Args:
        df: input dataframe holding the features and the label column.
        label: name of the target column.
        test_size: fraction of rows held out for the test split.
        penalty: norm of the penalty passed to LogisticRegression.
        dual: dual (constrained) or primal (regularized) formulation.

    Returns:
        dict with key 'te_pred_prob': predicted class probabilities on the test split.
    """
    # Work on a copy: the in-place label encoding below would otherwise
    # mutate the caller's dataframe.
    df = df.copy()
    nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
    for col in nonnumeric_columns:
        df[col] = LabelEncoder().fit_transform(df[col])
    df = df.fillna(0)
    features = [col for col in df if col != label]
    x, y = df[features], df[label]
    tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1)
    model = LogisticRegression(penalty=penalty, dual=dual)
    model.fit(tr_x, tr_y, )
    te_pred_prob = model.predict_proba(te_x)
    res = {
        'te_pred_prob': te_pred_prob
    }
    return res
@registry.register("classification_model", RandomForestClassification)
def random_forest_classification(df, label, test_size=0.2, n_estimators=100, criterion='gini'):
    """Train a RandomForestClassifier and return test-set class probabilities.

    Returns:
        dict with key 'te_pred_prob': predicted class probabilities on the test split.
    """
    # Work on a copy so the in-place label encoding does not mutate the caller's dataframe.
    df = df.copy()
    nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
    for col in nonnumeric_columns:
        df[col] = LabelEncoder().fit_transform(df[col])
    df = df.fillna(0)
    features = [col for col in df if col != label]
    x, y = df[features], df[label]
    tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1)
    model = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion)
    model.fit(tr_x, tr_y, )
    te_pred_prob = model.predict_proba(te_x)
    res = {
        'te_pred_prob': te_pred_prob
    }
    return res
@registry.register("classification_model", GradientBoostingClassification)
def gradient_boosting_classification(df, label, test_size=0.2, n_estimators=100, learning_rate=0.1):
    """Train a GradientBoostingClassifier and return test-set class probabilities.

    Returns:
        dict with key 'te_pred_prob': predicted class probabilities on the test split.
    """
    # Work on a copy so the in-place label encoding does not mutate the caller's dataframe.
    df = df.copy()
    nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
    for col in nonnumeric_columns:
        df[col] = LabelEncoder().fit_transform(df[col])
    df = df.fillna(0)
    features = [col for col in df if col != label]
    x, y = df[features], df[label]
    tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1)
    model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate)
    model.fit(tr_x, tr_y, )
    te_pred_prob = model.predict_proba(te_x)
    res = {
        'te_pred_prob': te_pred_prob
    }
    return res
#########
## 回归 ##
#########
@registry.register("regression_model", LinearRegressionRegression)
def linear_regression(df, label, test_size=0.2, ):
    """Train an ordinary-least-squares LinearRegression and return test-set predictions.

    Returns:
        dict with key 'te_pred_prob': point predictions on the test split.
        (Key name kept for interface compatibility with the classification tools,
        even though these are predictions, not probabilities.)
    """
    # Work on a copy so the in-place label encoding does not mutate the caller's dataframe.
    df = df.copy()
    nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
    for col in nonnumeric_columns:
        df[col] = LabelEncoder().fit_transform(df[col])
    df = df.fillna(0)
    features = [col for col in df if col != label]
    x, y = df[features], df[label]
    tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1)
    model = LinearRegression()
    model.fit(tr_x, tr_y, )
    te_pred_prob = model.predict(te_x)
    res = {
        'te_pred_prob': te_pred_prob
    }
    return res
@registry.register("regression_model", RandomForestRegression)
def random_forest_regression(df, label, test_size=0.2, n_estimators=100, criterion='squared_error'):
    """Train a RandomForestRegressor and return test-set predictions.

    Returns:
        dict with key 'te_pred_prob': point predictions on the test split
        (key name kept for interface compatibility; these are not probabilities).
    """
    # Work on a copy so the in-place label encoding does not mutate the caller's dataframe.
    df = df.copy()
    nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
    for col in nonnumeric_columns:
        df[col] = LabelEncoder().fit_transform(df[col])
    df = df.fillna(0)
    features = [col for col in df if col != label]
    x, y = df[features], df[label]
    tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1)
    model = RandomForestRegressor(n_estimators=n_estimators, criterion=criterion)
    model.fit(tr_x, tr_y, )
    te_pred_prob = model.predict(te_x)
    res = {
        'te_pred_prob': te_pred_prob
    }
    return res
@registry.register("regression_model", GradientBoostingRegression)
def gradient_boosting_regression(df, label, test_size=0.2, n_estimators=100, learning_rate=0.1):
    """Train a GradientBoostingRegressor and return test-set predictions.

    Returns:
        dict with key 'te_pred_prob': point predictions on the test split
        (key name kept for interface compatibility; these are not probabilities).
    """
    # Work on a copy so the in-place label encoding does not mutate the caller's dataframe.
    df = df.copy()
    nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
    for col in nonnumeric_columns:
        df[col] = LabelEncoder().fit_transform(df[col])
    df = df.fillna(0)
    features = [col for col in df if col != label]
    x, y = df[features], df[label]
    tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1)
    model = GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=learning_rate)
    model.fit(tr_x, tr_y, )
    te_pred_prob = model.predict(te_x)
    res = {
        'te_pred_prob': te_pred_prob
    }
    return res
if __name__ == '__main__':
    def run():
        """Demo: fit every registered model on toy classification and regression data."""
        from sklearn.datasets import load_iris
        iris = load_iris(as_frame=True)
        df = iris['data']
        df['target'] = iris['target']
        # cast some columns to str / int to exercise the label-encoding path
        df[df.columns[0]] = df[df.columns[0]].astype(str)
        df[df.columns[1]] = df[df.columns[1]].astype(int)
        df['target'] = df['target'].astype(str)
        print(df)
        print('####' * 5)
        res = logistic_regression_classification(df, 'target', test_size=0.25, penalty='l2', dual=False)
        print(res['te_pred_prob'])
        print('####' * 5)
        res = random_forest_classification(df, 'target', test_size=0.25, n_estimators=100, criterion='gini')
        print(res['te_pred_prob'])
        print('####' * 5)
        res = gradient_boosting_classification(df, 'target', test_size=0.25, n_estimators=100, learning_rate=0.1)
        print(res['te_pred_prob'])

        from sklearn.datasets import make_regression
        import pandas as pd
        features, targets = make_regression()
        df = pd.DataFrame(features)
        df['target'] = targets
        df[df.columns[0]] = df[df.columns[0]].astype(str)
        df[df.columns[1]] = df[df.columns[1]].astype(int)
        print(df)
        print('####' * 5)
        res = linear_regression(df, 'target', test_size=0.25, )
        print(res['te_pred_prob'])
        print('####' * 5)
        res = random_forest_regression(df, 'target', test_size=0.25, n_estimators=100, criterion='squared_error')
        print(res['te_pred_prob'])
        print('####' * 5)
        res = gradient_boosting_regression(df, 'target', test_size=0.25, n_estimators=100, learning_rate=0.1)
        print(res['te_pred_prob'])
    run()

View file

@@ -0,0 +1,62 @@
import pandas as pd
from metagpt.tools.functions.schemas.base import tool_field, ToolSchema
class FillMissingValue(ToolSchema):
    """Completing missing values with simple strategies"""
    # Argument schema for the `fill_missing_value` data_preprocess tool.
    df: pd.DataFrame = tool_field(description="input dataframe")
    features: list = tool_field(description="columns to be processed")
    strategy: str = tool_field(description="the imputation strategy", default='mean')
    # NOTE(review): annotated int, but the demo passes a str fill value ('hehe') --
    # presumably any scalar is accepted; confirm how ToolSchema handles the type.
    fill_value: int = tool_field(description="fill_value is used to replace all occurrences of missing_values", default=None)
# class LabelEncode(ToolSchema):
# """Completing missing values with simple strategies"""
# df: pd.DataFrame = tool_field(description="input dataframe")
# features: list = tool_field(description="columns to be processed")
class SplitBins(ToolSchema):
    """Bin continuous data into intervals and return the bin identifier encoded as an integer value"""
    # Argument schema for the `split_bins` data_preprocess tool.
    df: pd.DataFrame = tool_field(description="input dataframe")
    features: list = tool_field(description="columns to be processed")
    strategy: str = tool_field(description="Strategy used to define the widths of the bins", default='quantile')
class MinMaxScale(ToolSchema):
    """Transform features by scaling each feature to a range, which is (0, 1)"""
    # Argument schema for the `min_max_scale` data_preprocess tool.
    df: pd.DataFrame = tool_field(description="input dataframe")
    features: list = tool_field(description="columns to be processed")
class StandardScale(ToolSchema):
    """Standardize features by removing the mean and scaling to unit variance"""
    # Argument schema for the `standard_scale` data_preprocess tool.
    df: pd.DataFrame = tool_field(description="input dataframe")
    features: list = tool_field(description="columns to be processed")
class LogTransform(ToolSchema):
    """Performs a logarithmic transformation on the specified columns"""
    # Argument schema for the `log_transform` data_preprocess tool.
    df: pd.DataFrame = tool_field(description="input dataframe")
    features: list = tool_field(description="columns to be processed")
class MaxAbsScale(ToolSchema):
    """Scale each feature by its maximum absolute value"""
    # Argument schema for the `max_abs_scale` data_preprocess tool.
    df: pd.DataFrame = tool_field(description="input dataframe")
    features: list = tool_field(description="columns to be processed")
class RobustScale(ToolSchema):
    """Scale features using statistics that are robust to outliers, the quantile_range is (25.0, 75.0)"""
    # Argument schema for the `robust_scale` data_preprocess tool.
    df: pd.DataFrame = tool_field(description="input dataframe")
    features: list = tool_field(description="columns to be processed")
class OrdinalEncode(ToolSchema):
    """Encode categorical features as an integer array"""
    # Argument schema for the `ordinal_encode` data_preprocess tool.
    df: pd.DataFrame = tool_field(description="input dataframe")
    features: list = tool_field(description="columns to be processed")

View file

@@ -0,0 +1,55 @@
import pandas as pd
from metagpt.tools.functions.schemas.base import tool_field, ToolSchema
class LogisticRegressionClassification(ToolSchema):
    """Logistic Regression (aka logit, MaxEnt) classifier"""
    # Argument schema for the `logistic_regression_classification` tool.
    df: pd.DataFrame = tool_field(description="input dataframe")
    label: str = tool_field(description="target name")
    test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)
    penalty: str = tool_field(description="Specify the norm of the penalty", default="l2")
    # Fixed copy-paste bug: default was the string "l2" on a bool field;
    # the implementing function declares dual=False.
    dual: bool = tool_field(description="Dual (constrained) or primal (regularized) formulation", default=False)
class RandomForestClassification(ToolSchema):
    """random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting"""
    # Argument schema for the `random_forest_classification` tool.
    df: pd.DataFrame = tool_field(description="input dataframe")
    label: str = tool_field(description="target name")
    test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)
    n_estimators: int = tool_field(description="The number of trees in the forest", default=100)
    criterion: str = tool_field(description="The function to measure the quality of a split", default="gini")
class GradientBoostingClassification(ToolSchema):
    """Gradient Boosting for classification.This algorithm builds an additive model in a forward stage-wise fashion"""
    # Argument schema for the `gradient_boosting_classification` tool.
    df: pd.DataFrame = tool_field(description="input dataframe")
    label: str = tool_field(description="target name")
    test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)
    n_estimators: int = tool_field(description="The number of boosting stages to perform", default=100)
    learning_rate: float = tool_field(description="Learning rate shrinks the contribution of each tree by learning_rate", default=0.1)
class LinearRegressionRegression(ToolSchema):
    """Ordinary least squares Linear Regression."""
    # Argument schema for the `linear_regression` tool.
    df: pd.DataFrame = tool_field(description="input dataframe")
    label: str = tool_field(description="target name")
    test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)
class RandomForestRegression(ToolSchema):
    """random forest is a meta estimator that fits a number of decision tree on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting"""
    # Argument schema for the `random_forest_regression` tool.
    df: pd.DataFrame = tool_field(description="input dataframe")
    label: str = tool_field(description="target name")
    test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)
    n_estimators: int = tool_field(description="The number of trees in the forest", default=100)
    criterion: str = tool_field(description="The function to measure the quality of a split", default="squared_error")
class GradientBoostingRegression(ToolSchema):
    """Gradient Boosting for regression.This estimator builds an additive model in a forward stage-wise fashion"""
    # Argument schema for the `gradient_boosting_regression` tool.
    df: pd.DataFrame = tool_field(description="input dataframe")
    label: str = tool_field(description="target name")
    test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)
    n_estimators: int = tool_field(description="The number of boosting stages to perform", default=100)
    learning_rate: float = tool_field(description="Learning rate shrinks the contribution of each tree by learning_rate", default=0.1)