Merge branch 'dev' into dev_make_tools

This commit is contained in:
刘棒棒 2023-12-18 22:22:38 +08:00
commit ea7e11665d
24 changed files with 1801 additions and 1226 deletions

View file

@ -4,5 +4,3 @@
# @Author : lidanyang
# @File : __init__.py
# @Desc :
from metagpt.tools.functions.register.register import registry
import metagpt.tools.functions.libs.feature_engineering

View file

@ -0,0 +1,16 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2023/12/10 20:12
# @Author : lidanyang
# @File : base
# @Desc :
class MLProcess:
    """Base interface for fit/transform style dataframe processors.

    Subclasses are expected to override :meth:`fit` and :meth:`transform`;
    the versions defined here only raise ``NotImplementedError``.
    """

    def fit(self, df):
        """Learn whatever state the processor needs from ``df``."""
        raise NotImplementedError

    def transform(self, df):
        """Apply the processor to ``df`` and return the result."""
        raise NotImplementedError

    def fit_transform(self, df):
        """Call :meth:`fit` on ``df`` and then return :meth:`transform` of it."""
        self.fit(df)
        transformed = self.transform(df)
        return transformed

View file

@ -1,123 +1,153 @@
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from metagpt.tools.functions import registry
from metagpt.tools.functions.schemas.data_preprocess import *
from metagpt.tools.functions.libs.base import MLProcess
@registry.register("data_preprocess", FillMissingValue)
def fill_missing_value(df: pd.DataFrame, features: list, strategy: str = 'mean', fill_value=None):
    """Impute the ``features`` columns of ``df`` with a ``SimpleImputer``.

    ``strategy`` and ``fill_value`` are forwarded to the imputer. The selected
    columns are overwritten on ``df`` itself, which is then returned.
    """
    imputer = SimpleImputer(strategy=strategy, fill_value=fill_value)
    imputed = imputer.fit_transform(df[features])
    df[features] = imputed
    return df
class FillMissingValue(MLProcess):
    """Impute missing values in the chosen columns with a ``SimpleImputer``."""

    def __init__(self, features: list, strategy: str = 'mean', fill_value=None):
        self.features = features
        self.strategy = strategy
        self.fill_value = fill_value
        # Imputer instance; stays None until fit() creates it.
        self.si = None

    def fit(self, df: pd.DataFrame):
        """Create a new imputer from the stored settings and fit it on ``df[self.features]``."""
        self.si = SimpleImputer(strategy=self.strategy, fill_value=self.fill_value)
        training_slice = df[self.features]
        self.si.fit(training_slice)

    def transform(self, df: pd.DataFrame):
        """Overwrite ``df[self.features]`` with the imputed values and return ``df``."""
        target_cols = self.features
        df[target_cols] = self.si.transform(df[target_cols])
        return df
# @registry.register("data_preprocess", FillMissingValue)
# def label_encode(df: pd.DataFrame, features: list,):
# for col in features:
# df[col] = LabelEncoder().fit_transform(df[col])
# return df
class MinMaxScale(MLProcess):
    """Scale the chosen columns with a ``MinMaxScaler``."""

    def __init__(self, features: list):
        self.features = features
        # Scaler instance; stays None until fit() creates it.
        self.mms = None

    def fit(self, df: pd.DataFrame):
        """Create a fresh ``MinMaxScaler`` and fit it on ``df[self.features]``."""
        self.mms = MinMaxScaler()
        training_slice = df[self.features]
        self.mms.fit(training_slice)

    def transform(self, df: pd.DataFrame):
        """Overwrite ``df[self.features]`` with the scaled values and return ``df``."""
        target_cols = self.features
        df[target_cols] = self.mms.transform(df[target_cols])
        return df
@registry.register("data_preprocess", SplitBins)
def split_bins(df: pd.DataFrame, features: list, strategy: str = 'quantile'):
    """Discretize the ``features`` columns with an ordinal ``KBinsDiscretizer``.

    ``strategy`` is forwarded to the discretizer. The selected columns are
    overwritten on ``df`` itself, which is then returned.
    """
    discretizer = KBinsDiscretizer(strategy=strategy, encode='ordinal')
    binned = discretizer.fit_transform(df[features])
    df[features] = binned
    return df
class StandardScale(MLProcess):
    """Scale the chosen columns with a ``StandardScaler``."""

    def __init__(self, features: list):
        self.features = features
        # Scaler instance; stays None until fit() creates it.
        self.ss = None

    def fit(self, df: pd.DataFrame):
        """Create a fresh ``StandardScaler`` and fit it on ``df[self.features]``."""
        self.ss = StandardScaler()
        training_slice = df[self.features]
        self.ss.fit(training_slice)

    def transform(self, df: pd.DataFrame):
        """Overwrite ``df[self.features]`` with the scaled values and return ``df``."""
        target_cols = self.features
        df[target_cols] = self.ss.transform(df[target_cols])
        return df
@registry.register("data_preprocess", MinMaxScale)
def min_max_scale(df: pd.DataFrame, features: list):
    """Fit a ``MinMaxScaler`` on ``df[features]`` and write the scaled values back.

    The selected columns are overwritten on ``df`` itself, which is then returned.
    """
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(df[features])
    df[features] = scaled
    return df
class MaxAbsScale(MLProcess):
    """Scale the chosen columns with a ``MaxAbsScaler``."""

    def __init__(self, features: list):
        self.features = features
        # Scaler instance; stays None until fit() creates it.
        self.mas = None

    def fit(self, df: pd.DataFrame):
        """Create a fresh ``MaxAbsScaler`` and fit it on ``df[self.features]``."""
        self.mas = MaxAbsScaler()
        training_slice = df[self.features]
        self.mas.fit(training_slice)

    def transform(self, df: pd.DataFrame):
        """Overwrite ``df[self.features]`` with the scaled values and return ``df``."""
        target_cols = self.features
        df[target_cols] = self.mas.transform(df[target_cols])
        return df
@registry.register("data_preprocess", StandardScale)
def standard_scale(df: pd.DataFrame, features: list):
    """Fit a ``StandardScaler`` on ``df[features]`` and write the scaled values back.

    The selected columns are overwritten on ``df`` itself, which is then returned.
    """
    scaler = StandardScaler()
    scaled = scaler.fit_transform(df[features])
    df[features] = scaled
    return df
class RobustScale(MLProcess):
    """Scale the chosen columns with a ``RobustScaler``."""

    def __init__(self, features: list):
        self.features = features
        # Scaler instance; stays None until fit() creates it.
        self.rs = None

    def fit(self, df: pd.DataFrame):
        """Create a fresh ``RobustScaler`` and fit it on ``df[self.features]``."""
        self.rs = RobustScaler()
        training_slice = df[self.features]
        self.rs.fit(training_slice)

    def transform(self, df: pd.DataFrame):
        """Overwrite ``df[self.features]`` with the scaled values and return ``df``."""
        target_cols = self.features
        df[target_cols] = self.rs.transform(df[target_cols])
        return df
@registry.register("data_preprocess", LogTransform)
def log_transform(df: pd.DataFrame, features: list):
    """Replace each column in ``features`` by its natural logarithm.

    A column whose minimum is not positive is first shifted so that its
    minimum becomes 2. Columns are overwritten on ``df``, which is returned.
    """
    for name in features:
        lowest = df[name].min()
        if lowest <= 0:
            # Shift so the smallest value becomes 2 before taking the log.
            df[name] = df[name] - lowest + 2
        df[name] = np.log(df[name])
    return df
class OrdinalEncode(MLProcess):
    """Encode the chosen columns with an ``OrdinalEncoder``."""

    def __init__(self, features: list):
        self.features = features
        # Encoder instance; stays None until fit() creates it.
        self.oe = None

    def fit(self, df: pd.DataFrame):
        """Create a fresh ``OrdinalEncoder`` and fit it on ``df[self.features]``."""
        self.oe = OrdinalEncoder()
        training_slice = df[self.features]
        self.oe.fit(training_slice)

    def transform(self, df: pd.DataFrame):
        """Overwrite ``df[self.features]`` with the encoded values and return ``df``."""
        target_cols = self.features
        df[target_cols] = self.oe.transform(df[target_cols])
        return df
@registry.register("data_preprocess", MaxAbsScale)
def max_abs_scale(df: pd.DataFrame, features: list):
    """Fit a ``MaxAbsScaler`` on ``df[features]`` and write the scaled values back.

    The selected columns are overwritten on ``df`` itself, which is then returned.
    """
    scaler = MaxAbsScaler()
    scaled = scaler.fit_transform(df[features])
    df[features] = scaled
    return df
class OneHotEncode(MLProcess):
    """One-hot encode the chosen columns, replacing them with indicator columns."""

    def __init__(self, features: list):
        self.features = features
        # Encoder instance; stays None until fit() creates it.
        self.ohe = None

    def fit(self, df: pd.DataFrame):
        """Create a dense, unknown-tolerant ``OneHotEncoder`` and fit it on ``df[self.features]``."""
        try:
            # Newer scikit-learn names the dense-output switch ``sparse_output``.
            self.ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
        except TypeError:
            # Older releases only know the original ``sparse`` keyword.
            self.ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)
        self.ohe.fit(df[self.features])

    def transform(self, df: pd.DataFrame):
        """Return ``df`` with the source columns replaced by their one-hot columns.

        Note: the source columns are dropped from the passed-in ``df`` in place;
        the returned frame is a new object built by concatenation.
        """
        ts_data = self.ohe.transform(df[self.features])
        new_columns = self.ohe.get_feature_names_out(self.features)
        # Reuse df's index so the concat below aligns row by row.
        ts_data = pd.DataFrame(ts_data, columns=new_columns, index=df.index)
        df.drop(self.features, axis=1, inplace=True)
        df = pd.concat([df, ts_data], axis=1)
        return df
@registry.register("data_preprocess", RobustScale)
def robust_scale(df: pd.DataFrame, features: list):
    """Fit a ``RobustScaler`` on ``df[features]`` and write the scaled values back.

    The selected columns are overwritten on ``df`` itself, which is then returned.
    """
    scaler = RobustScaler()
    scaled = scaler.fit_transform(df[features])
    df[features] = scaled
    return df
class LabelEncode(MLProcess):
    """Label-encode the chosen columns, mapping unseen values to ``'unknown'``."""

    def __init__(self, features: list):
        self.features = features
        # One fitted LabelEncoder per entry of ``features``, in the same order.
        self.le_encoders = []

    def fit(self, df: pd.DataFrame):
        """Fit one ``LabelEncoder`` per column on its string values plus ``'unknown'``."""
        # Rebuild the list from scratch: appending to the previous one would leave
        # stale encoders in front after a second fit(), and transform() pairs
        # encoders with columns by position.
        self.le_encoders = [
            LabelEncoder().fit(df[col].astype(str).unique().tolist() + ['unknown'])
            for col in self.features
        ]

    def transform(self, df: pd.DataFrame):
        """Overwrite each column with its integer codes and return ``df``.

        Values are compared as strings; any value the encoder has not seen is
        replaced by ``'unknown'`` before encoding.
        """
        for col, encoder in zip(self.features, self.le_encoders):
            known = set(encoder.classes_)
            values = [
                item if item in known else 'unknown'
                for item in df[col].astype(str).tolist()
            ]
            df[col] = encoder.transform(values)
        return df
@registry.register("data_preprocess", OrdinalEncode)
def ordinal_encode(df: pd.DataFrame, features: list):
    """Fit an ``OrdinalEncoder`` on ``df[features]`` and write the codes back.

    The selected columns are overwritten on ``df`` itself, which is then returned.
    """
    encoder = OrdinalEncoder()
    encoded = encoder.fit_transform(df[features])
    df[features] = encoded
    return df
def get_column_info(df: pd.DataFrame) -> dict:
data = []
for i in df.columns:
nan_freq = float("%.2g" % (df[i].isna().mean() * 100))
n_unique = df[i].nunique()
data_type = str(df[i].dtype).replace("dtype('", "").replace("')", "")
if data_type == "O":
data_type = "object"
data.append([i, data_type, nan_freq, n_unique])
if __name__ == '__main__':
def run():
V = {
'a': [-1, 2, 3, 6, 5, 4],
'b': [1.1, 2.2, 3.3, 6.6, 5.5, 4.4],
'c': ['aa', 'bb', 'cc', 'dd', 'ee', 'ff'],
'd': [1, None, 3, None, 5, 4],
'e': [1.1, np.NAN, 3.3, None, 5.5, 4.4],
'f': ['aa', np.NAN, 'cc', None, '', 'ff'],
}
df = pd.DataFrame(V)
print(df.dtypes)
numeric_features = ['a', 'b', 'd', 'e']
numeric_features_wo_miss = ['a', 'b', ]
categorial_features = ['c', 'f']
df_ = fill_missing_value(df.copy(), numeric_features)
print(df_)
df_ = fill_missing_value(df.copy(), categorial_features, strategy='constant', fill_value='hehe')
print(df_)
df_ = fill_missing_value(df.copy(), numeric_features, strategy='constant', fill_value=999)
print(df_)
# df_ = label_encode(df.copy(), numeric_features + categorial_features, )
# print(df_)
df_ = split_bins(df.copy(), numeric_features_wo_miss, strategy='quantile')
print(df_)
df_ = min_max_scale(df.copy(), numeric_features, )
print(df_)
df_ = standard_scale(df.copy(), numeric_features, )
print(df_)
df_ = log_transform(df.copy(), numeric_features, )
print(df_)
df_ = max_abs_scale(df.copy(), numeric_features, )
print(df_)
df_ = robust_scale(df.copy(), numeric_features, )
print(df_)
run()
samples = pd.DataFrame(
data,
columns=["Column_name", "Data_type", "NaN_Frequency(%)", "N_unique"],
)
return samples.to_dict(orient='list')

View file

@ -3,172 +3,290 @@
# @Time : 2023/11/17 10:33
# @Author : lidanyang
# @File : feature_engineering.py
# @Desc : Feature Engineering Functions
# @Desc : Feature Engineering Tools
import itertools
import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta
from joblib import Parallel, delayed
from pandas.api.types import is_numeric_dtype
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder
from pandas.core.dtypes.common import is_object_dtype
from sklearn.model_selection import KFold
from sklearn.preprocessing import PolynomialFeatures, KBinsDiscretizer
from metagpt.tools.functions import registry
from metagpt.tools.functions.schemas.feature_engineering import *
from metagpt.tools.functions.libs.base import MLProcess
@registry.register("feature_engineering", PolynomialExpansion)
def polynomial_expansion(df, cols, degree=2):
for col in cols:
if not is_numeric_dtype(df[col]):
raise ValueError(f"Column '{col}' must be numeric.")
class PolynomialExpansion(MLProcess):
def __init__(self, cols: list, degree: int = 2):
self.cols = cols
self.degree = degree
self.poly = PolynomialFeatures(degree=degree, include_bias=False)
poly = PolynomialFeatures(degree=degree, include_bias=False)
ts_data = poly.fit_transform(df[cols].fillna(0))
new_columns = poly.get_feature_names_out(cols)
ts_data = pd.DataFrame(ts_data, columns=new_columns, index=df.index)
ts_data = ts_data.drop(cols, axis=1)
df = pd.concat([df, ts_data], axis=1)
return df
def fit(self, df: pd.DataFrame):
self.poly.fit(df[self.cols].fillna(0))
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
ts_data = self.poly.transform(df[self.cols].fillna(0))
column_name = self.poly.get_feature_names_out(self.cols)
ts_data = pd.DataFrame(ts_data, index=df.index, columns=column_name)
df.drop(self.cols, axis=1, inplace=True)
df = pd.concat([df, ts_data], axis=1)
return df
@registry.register("feature_engineering", OneHotEncoding)
def one_hot_encoding(df, cols):
    """One-hot encode ``cols`` and return ``df`` with those columns replaced.

    Note: ``cols`` are dropped from the passed-in ``df`` in place; the returned
    frame is a new object built by concatenation.
    """
    try:
        # Newer scikit-learn names the dense-output switch ``sparse_output``.
        enc = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        # Older releases only know the original ``sparse`` keyword.
        enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
    ts_data = enc.fit_transform(df[cols])
    new_columns = enc.get_feature_names_out(cols)
    # Reuse df's index so the concat below aligns row by row.
    ts_data = pd.DataFrame(ts_data, columns=new_columns, index=df.index)
    df.drop(cols, axis=1, inplace=True)
    df = pd.concat([df, ts_data], axis=1)
    return df
class CatCount(MLProcess):
    """Add a ``<col>_cnt`` column holding how often each value occurred in the fitted data."""

    def __init__(self, col: str):
        self.col = col
        # value -> occurrence count, filled in by fit().
        self.encoder_dict = None

    def fit(self, df: pd.DataFrame):
        """Store the value counts of ``df[self.col]`` as a dict."""
        counts = df[self.col].value_counts()
        self.encoder_dict = counts.to_dict()

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Map ``df[self.col]`` through the stored counts into ``<col>_cnt`` and return ``df``."""
        count_col = f"{self.col}_cnt"
        df[count_col] = df[self.col].map(self.encoder_dict)
        return df
@registry.register("feature_engineering", FrequencyEncoding)
def frequency_encoding(df, cols):
    """Add a ``<col>_cnt`` column for each column in ``cols``.

    Each new column holds, per row, how many times that row's value occurs in
    the column. ``df`` is modified in place and returned.
    """
    for name in cols:
        counts = df[name].value_counts().to_dict()
        df[f"{name}_cnt"] = df[name].map(counts)
    return df
class TargetMeanEncoder(MLProcess):
    """Add a ``<col>_target_mean`` column with the per-category mean of the label."""

    def __init__(self, col: str, label: str):
        self.col = col
        self.label = label
        # category -> mean label value, filled in by fit().
        self.encoder_dict = None

    def fit(self, df: pd.DataFrame):
        """Store the mean of ``label`` for every value of ``col`` in ``df``."""
        label_means = df.groupby(self.col)[self.label].mean()
        self.encoder_dict = label_means.to_dict()

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Map ``df[self.col]`` through the stored means into ``<col>_target_mean`` and return ``df``."""
        mean_col = f"{self.col}_target_mean"
        df[mean_col] = df[self.col].map(self.encoder_dict)
        return df
@registry.register("feature_engineering", CatCross)
def cat_cross(df, cols, max_cat_num=100):
for col in cols:
if df[col].nunique() > max_cat_num:
cols.remove(col)
class KFoldTargetMeanEncoder(MLProcess):
def __init__(self, col: str, label: str, n_splits: int = 5, random_state: int = 2021):
self.col = col
self.label = label
self.n_splits = n_splits
self.random_state = random_state
self.encoder_dict = None
for col1, col2 in itertools.combinations(cols, 2):
cross_col = f"{col1}_cross_{col2}"
df[cross_col] = df[col1].astype(str) + "_" + df[col2].astype(str)
return df
def fit(self, df: pd.DataFrame):
tmp = df.copy()
kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state)
@registry.register("feature_engineering", GroupStat)
def group_stat(df, group_col, agg_col, agg_funcs):
group_df = df.groupby(group_col)[agg_col].agg(agg_funcs).reset_index()
group_df.columns = group_col + [
f"{agg_col}_{agg_func}_by_{group_col}" for agg_func in agg_funcs
]
df = df.merge(group_df, on=group_col, how="left")
return df
@registry.register("feature_engineering", ExtractTimeComps)
def extract_time_comps(df, time_col, time_comps):
time_s = pd.to_datetime(df[time_col], errors="coerce")
time_comps_df = pd.DataFrame()
if "year" in time_comps:
time_comps_df["year"] = time_s.dt.year
if "month" in time_comps:
time_comps_df["month"] = time_s.dt.month
if "day" in time_comps:
time_comps_df["day"] = time_s.dt.day
if "hour" in time_comps:
time_comps_df["hour"] = time_s.dt.hour
if "dayofweek" in time_comps:
time_comps_df["dayofweek"] = time_s.dt.dayofweek + 1
if "is_weekend" in time_comps:
time_comps_df["is_weekend"] = time_s.dt.dayofweek.isin([5, 6]).astype(int)
df = pd.concat([df, time_comps_df], axis=1)
return df
@registry.register("feature_engineering", FeShiftByTime)
def fe_shift_by_time(df, time_col, group_col, shift_col, periods, freq):
df[time_col] = pd.to_datetime(df[time_col])
def shift_datetime(date, offset, unit):
if unit in ["year", "y", "Y"]:
return date + relativedelta(years=offset)
elif unit in ["month", "m", "M"]:
return date + relativedelta(months=offset)
elif unit in ["day", "d", "D"]:
return date + relativedelta(days=offset)
elif unit in ["week", "w", "W"]:
return date + relativedelta(weeks=offset)
elif unit in ["hour", "h", "H"]:
return date + relativedelta(hours=offset)
else:
return date
def shift_by_time_on_key(
inner_df, time_col, group_col, shift_col, offset, unit, col_name
):
inner_df = inner_df.drop_duplicates()
inner_df[time_col] = inner_df[time_col].map(
lambda x: shift_datetime(x, offset, unit)
)
inner_df = inner_df.groupby([time_col, group_col], as_index=False)[
shift_col
].mean()
inner_df.rename(columns={shift_col: col_name}, inplace=True)
return inner_df
shift_df = df[[time_col, group_col, shift_col]].copy()
for period in periods:
new_col_name = f"{group_col}_{shift_col}_lag_{period}_{freq}"
tmp = shift_by_time_on_key(
shift_df, time_col, group_col, shift_col, period, freq, new_col_name
)
df = df.merge(tmp, on=[time_col, group_col], how="left")
return df
@registry.register("feature_engineering", FeRollingByTime)
def fe_rolling_by_time(df, time_col, group_col, rolling_col, periods, freq, agg_funcs):
df[time_col] = pd.to_datetime(df[time_col])
def rolling_by_time_on_key(inner_df, offset, unit, agg_func, col_name):
time_freq = {
"Y": [365 * offset, "D"],
"M": [30 * offset, "D"],
"D": [offset, "D"],
"W": [7 * offset, "D"],
"H": [offset, "h"],
}
if agg_func not in ["mean", "std", "max", "min", "median", "sum", "count"]:
raise ValueError(f"Invalid agg function: {agg_func}")
rolling_feat = inner_df.rolling(
f"{time_freq[unit][0]}{time_freq[unit][1]}", closed="left"
)
rolling_feat = getattr(rolling_feat, agg_func)()
depth = df.columns.nlevels
rolling_feat = rolling_feat.stack(list(range(depth)))
rolling_feat.name = col_name
return rolling_feat
rolling_df = df[[time_col, group_col, rolling_col]].copy()
for period in periods:
for func in agg_funcs:
new_col_name = f"{group_col}_{rolling_col}_rolling_{period}_{freq}_{func}"
tmp = pd.pivot_table(
rolling_df,
index=time_col,
values=rolling_col,
columns=group_col,
global_mean = tmp[self.label].mean()
col_name = f"{self.col}_kf_target_mean"
for trn_idx, val_idx in kf.split(tmp, tmp[self.label]):
_trn, _val = tmp.iloc[trn_idx], tmp.iloc[val_idx]
tmp.loc[tmp.index[val_idx], col_name] = _val[self.col].map(
_trn.groupby(self.col)[self.label].mean()
)
tmp = rolling_by_time_on_key(tmp, period, freq, func, new_col_name)
df = df.merge(tmp, on=[time_col, group_col], how="left")
tmp[col_name].fillna(global_mean, inplace=True)
self.encoder_dict = tmp.groupby(self.col)[col_name].mean().to_dict()
return df
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
df[f"{self.col}_kf_target_mean"] = df[self.col].map(self.encoder_dict)
return df
class CatCross(MLProcess):
    """Create integer-coded cross features for every pair of the chosen columns."""

    def __init__(self, cols: list, max_cat_num: int = 100):
        self.cols = cols
        self.max_cat_num = max_cat_num
        # Column pairs to cross, filled in by fit().
        self.combs = []
        # new column name -> {(value_a, value_b): integer code}, filled in by fit().
        self.combs_map = {}

    @staticmethod
    def cross_two(comb, df):
        """Build the code table for one column pair.

        Returns:
            tuple: ``(new_col, comb_map)`` where ``new_col`` is ``"<a>_<b>"`` and
            ``comb_map`` maps every product of the two columns' unique values
            to a consecutive integer starting at 0.
        """
        new_col = f'{comb[0]}_{comb[1]}'
        new_col_combs = itertools.product(df[comb[0]].unique(), df[comb[1]].unique())
        comb_map = {pair: code for code, pair in enumerate(new_col_combs)}
        return new_col, comb_map

    def fit(self, df: pd.DataFrame):
        """Drop columns with more than ``max_cat_num`` unique values, then build all pair maps."""
        # Build a new list instead of removing while iterating: in-place removal
        # skips the element after each removed one, and also edits the caller's list.
        self.cols = [col for col in self.cols if df[col].nunique() <= self.max_cat_num]
        self.combs = list(itertools.combinations(self.cols, 2))
        res = Parallel(n_jobs=4, require='sharedmem')(
            delayed(self.cross_two)(comb, df) for comb in self.combs
        )
        self.combs_map = dict(res)

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add one integer column per fitted pair to ``df`` and return it.

        Value pairs that are not in the fitted table get the code ``len(table)``,
        i.e. one past the largest fitted code.
        """
        for comb in self.combs:
            new_col = f'{comb[0]}_{comb[1]}'
            _map = self.combs_map[new_col]
            # Carry df's own index; a default RangeIndex would misalign on assignment
            # whenever df is not indexed 0..n-1.
            pairs = pd.Series(list(zip(df[comb[0]], df[comb[1]])), index=df.index)
            # Codes are 0..len-1, so len(_map) is the next free code (and is safe
            # for an empty table, where max() would raise).
            unknown_code = len(_map)
            df[new_col] = pairs.map(_map).fillna(unknown_code).astype(int)
        return df
class GroupStat(MLProcess):
    """Attach per-group aggregates of one column to each row."""

    def __init__(self, group_col: str, agg_col: str, agg_funcs: list):
        self.group_col = group_col
        self.agg_col = agg_col
        self.agg_funcs = agg_funcs
        # Aggregated lookup frame, filled in by fit().
        self.group_df = None

    def fit(self, df: pd.DataFrame):
        """Aggregate ``agg_col`` by ``group_col`` with every function in ``agg_funcs``.

        The result columns are named ``<agg_col>_<func>_by_<group_col>``.
        """
        stats = df.groupby(self.group_col)[self.agg_col].agg(self.agg_funcs).reset_index()
        stat_names = [
            f"{self.agg_col}_{agg_func}_by_{self.group_col}" for agg_func in self.agg_funcs
        ]
        stats.columns = [self.group_col] + stat_names
        self.group_df = stats

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Left-merge the fitted aggregates onto ``df`` by ``group_col`` and return the result."""
        merged = df.merge(self.group_df, on=self.group_col, how="left")
        return merged
class SplitBins(MLProcess):
    """Discretize the chosen columns into ordinal bins with a ``KBinsDiscretizer``."""

    def __init__(self, cols: list, strategy: str = 'quantile'):
        # A bare column name would select a 1-D Series, which the discretizer
        # cannot fit; normalise it to a one-element list.
        self.cols = [cols] if isinstance(cols, str) else cols
        self.strategy = strategy
        # Discretizer instance; stays None until fit() creates it.
        self.encoder = None

    def fit(self, df: pd.DataFrame):
        """Fit the discretizer on ``df[self.cols]`` with missing values replaced by 0."""
        self.encoder = KBinsDiscretizer(strategy=self.strategy, encode='ordinal')
        self.encoder.fit(df[self.cols].fillna(0))

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Overwrite ``df[self.cols]`` with bin indices (NaN treated as 0) and return ``df``."""
        df[self.cols] = self.encoder.transform(df[self.cols].fillna(0))
        return df
# @registry.register("feature_engineering", ExtractTimeComps)
# def extract_time_comps(df, time_col, time_comps):
# time_s = pd.to_datetime(df[time_col], errors="coerce")
# time_comps_df = pd.DataFrame()
#
# if "year" in time_comps:
# time_comps_df["year"] = time_s.dt.year
# if "month" in time_comps:
# time_comps_df["month"] = time_s.dt.month
# if "day" in time_comps:
# time_comps_df["day"] = time_s.dt.day
# if "hour" in time_comps:
# time_comps_df["hour"] = time_s.dt.hour
# if "dayofweek" in time_comps:
# time_comps_df["dayofweek"] = time_s.dt.dayofweek + 1
# if "is_weekend" in time_comps:
# time_comps_df["is_weekend"] = time_s.dt.dayofweek.isin([5, 6]).astype(int)
# df = pd.concat([df, time_comps_df], axis=1)
# return df
#
#
# @registry.register("feature_engineering", FeShiftByTime)
# def fe_shift_by_time(df, time_col, group_col, shift_col, periods, freq):
# df[time_col] = pd.to_datetime(df[time_col])
#
# def shift_datetime(date, offset, unit):
# if unit in ["year", "y", "Y"]:
# return date + relativedelta(years=offset)
# elif unit in ["month", "m", "M"]:
# return date + relativedelta(months=offset)
# elif unit in ["day", "d", "D"]:
# return date + relativedelta(days=offset)
# elif unit in ["week", "w", "W"]:
# return date + relativedelta(weeks=offset)
# elif unit in ["hour", "h", "H"]:
# return date + relativedelta(hours=offset)
# else:
# return date
#
# def shift_by_time_on_key(
# inner_df, time_col, group_col, shift_col, offset, unit, col_name
# ):
# inner_df = inner_df.drop_duplicates()
# inner_df[time_col] = inner_df[time_col].map(
# lambda x: shift_datetime(x, offset, unit)
# )
# inner_df = inner_df.groupby([time_col, group_col], as_index=False)[
# shift_col
# ].mean()
# inner_df.rename(columns={shift_col: col_name}, inplace=True)
# return inner_df
#
# shift_df = df[[time_col, group_col, shift_col]].copy()
# for period in periods:
# new_col_name = f"{group_col}_{shift_col}_lag_{period}_{freq}"
# tmp = shift_by_time_on_key(
# shift_df, time_col, group_col, shift_col, period, freq, new_col_name
# )
# df = df.merge(tmp, on=[time_col, group_col], how="left")
#
# return df
#
#
# @registry.register("feature_engineering", FeRollingByTime)
# def fe_rolling_by_time(df, time_col, group_col, rolling_col, periods, freq, agg_funcs):
# df[time_col] = pd.to_datetime(df[time_col])
#
# def rolling_by_time_on_key(inner_df, offset, unit, agg_func, col_name):
# time_freq = {
# "Y": [365 * offset, "D"],
# "M": [30 * offset, "D"],
# "D": [offset, "D"],
# "W": [7 * offset, "D"],
# "H": [offset, "h"],
# }
#
# if agg_func not in ["mean", "std", "max", "min", "median", "sum", "count"]:
# raise ValueError(f"Invalid agg function: {agg_func}")
#
# rolling_feat = inner_df.rolling(
# f"{time_freq[unit][0]}{time_freq[unit][1]}", closed="left"
# )
# rolling_feat = getattr(rolling_feat, agg_func)()
# depth = df.columns.nlevels
# rolling_feat = rolling_feat.stack(list(range(depth)))
# rolling_feat.name = col_name
# return rolling_feat
#
# rolling_df = df[[time_col, group_col, rolling_col]].copy()
# for period in periods:
# for func in agg_funcs:
# new_col_name = f"{group_col}_{rolling_col}_rolling_{period}_{freq}_{func}"
# tmp = pd.pivot_table(
# rolling_df,
# index=time_col,
# values=rolling_col,
# columns=group_col,
# )
# tmp = rolling_by_time_on_key(tmp, period, freq, func, new_col_name)
# df = df.merge(tmp, on=[time_col, group_col], how="left")
#
# return df
class GeneralSelection(MLProcess):
    """Drop feature columns that are empty, constant, infinite or all-unique objects."""

    def __init__(self, label_col: str):
        self.label_col = label_col
        # Names of the feature columns kept by fit().
        self.feats = []

    def fit(self, df: pd.DataFrame):
        """Select the feature columns of ``df`` worth keeping.

        A column other than ``label_col`` is discarded when any of these holds:
        every value is null; it has exactly one unique value; it contains
        ``+inf`` or ``-inf``; it is object-typed with a distinct value per row.
        """
        n_rows = df.shape[0]
        kept = []
        for col in df.columns:
            # The label is never a candidate feature; it is re-added in transform().
            if col == self.label_col:
                continue
            series = df[col]
            if n_rows and series.isnull().all():
                continue
            if series.nunique() == 1:
                continue
            # Both signs are checked; testing only +inf lets -inf columns through.
            if series.isin([np.inf, -np.inf]).any():
                continue
            if is_object_dtype(series) and series.nunique() == n_rows:
                continue
            # Each column is decided exactly once, so a column matching several
            # rules can no longer trigger a second list.remove() and a ValueError.
            kept.append(col)
        self.feats = kept

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return ``df`` restricted to the kept features followed by the label column."""
        df = df[self.feats + [self.label_col]]
        return df

View file

@ -1,196 +0,0 @@
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from metagpt.tools.functions import registry
from metagpt.tools.functions.schemas.ml_model import *
####################
## Classification ##
####################
@registry.register("classification_model", LogisticRegressionClassification)
def logistic_regression_classification(df, label, test_size=0.2, penalty='l2', dual=False):
    """Fit a logistic regression on a train split of ``df`` and predict the test split.

    Object-dtype columns are label-encoded and remaining missing values are
    filled with 0 before splitting with ``random_state=1``.

    Args:
        df: Input dataframe holding the features and the ``label`` column.
        label: Name of the target column.
        test_size: Forwarded to ``train_test_split``.
        penalty: Forwarded to ``LogisticRegression``.
        dual: Forwarded to ``LogisticRegression``.

    Returns:
        dict: ``{'te_pred_prob': ...}`` with the ``predict_proba`` output for the test split.
    """
    # Copy first so the label encoding below is not written into the caller's frame.
    df = df.copy()
    nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
    for col in nonnumeric_columns:
        df[col] = LabelEncoder().fit_transform(df[col])
    df = df.fillna(0)

    features = [col for col in df if col != label]
    tr_x, te_x, tr_y, _ = train_test_split(
        df[features], df[label], test_size=test_size, random_state=1
    )
    model = LogisticRegression(penalty=penalty, dual=dual)
    model.fit(tr_x, tr_y)
    return {'te_pred_prob': model.predict_proba(te_x)}
@registry.register("classification_model", RandomForestClassification)
def random_forest_classification(df, label, test_size=0.2, n_estimators=100, criterion='gini'):
    """Fit a random forest classifier on a train split of ``df`` and predict the test split.

    Object-dtype columns are label-encoded and remaining missing values are
    filled with 0 before splitting with ``random_state=1``.

    Args:
        df: Input dataframe holding the features and the ``label`` column.
        label: Name of the target column.
        test_size: Forwarded to ``train_test_split``.
        n_estimators: Forwarded to ``RandomForestClassifier``.
        criterion: Forwarded to ``RandomForestClassifier``.

    Returns:
        dict: ``{'te_pred_prob': ...}`` with the ``predict_proba`` output for the test split.
    """
    # Copy first so the label encoding below is not written into the caller's frame.
    df = df.copy()
    nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
    for col in nonnumeric_columns:
        df[col] = LabelEncoder().fit_transform(df[col])
    df = df.fillna(0)

    features = [col for col in df if col != label]
    tr_x, te_x, tr_y, _ = train_test_split(
        df[features], df[label], test_size=test_size, random_state=1
    )
    model = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion)
    model.fit(tr_x, tr_y)
    return {'te_pred_prob': model.predict_proba(te_x)}
@registry.register("classification_model", GradientBoostingClassification)
def gradient_boosting_classification(df, label, test_size=0.2, n_estimators=100, learning_rate=0.1):
    """Fit a gradient boosting classifier on a train split of ``df`` and predict the test split.

    Object-dtype columns are label-encoded and remaining missing values are
    filled with 0 before splitting with ``random_state=1``.

    Args:
        df: Input dataframe holding the features and the ``label`` column.
        label: Name of the target column.
        test_size: Forwarded to ``train_test_split``.
        n_estimators: Forwarded to ``GradientBoostingClassifier``.
        learning_rate: Forwarded to ``GradientBoostingClassifier``.

    Returns:
        dict: ``{'te_pred_prob': ...}`` with the ``predict_proba`` output for the test split.
    """
    # Copy first so the label encoding below is not written into the caller's frame.
    df = df.copy()
    nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
    for col in nonnumeric_columns:
        df[col] = LabelEncoder().fit_transform(df[col])
    df = df.fillna(0)

    features = [col for col in df if col != label]
    tr_x, te_x, tr_y, _ = train_test_split(
        df[features], df[label], test_size=test_size, random_state=1
    )
    model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate)
    model.fit(tr_x, tr_y)
    return {'te_pred_prob': model.predict_proba(te_x)}
################
## Regression ##
################
@registry.register("regression_model", LinearRegressionRegression)
def linear_regression(df, label, test_size=0.2):
    """Fit a linear regression on a train split of ``df`` and predict the test split.

    Object-dtype columns are label-encoded and remaining missing values are
    filled with 0 before splitting with ``random_state=1``.

    Args:
        df: Input dataframe holding the features and the ``label`` column.
        label: Name of the target column.
        test_size: Forwarded to ``train_test_split``.

    Returns:
        dict: ``{'te_pred_prob': ...}`` with the ``predict`` output for the test split.
    """
    # Copy first so the label encoding below is not written into the caller's frame.
    df = df.copy()
    nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
    for col in nonnumeric_columns:
        df[col] = LabelEncoder().fit_transform(df[col])
    df = df.fillna(0)

    features = [col for col in df if col != label]
    tr_x, te_x, tr_y, _ = train_test_split(
        df[features], df[label], test_size=test_size, random_state=1
    )
    model = LinearRegression()
    model.fit(tr_x, tr_y)
    return {'te_pred_prob': model.predict(te_x)}
@registry.register("regression_model", RandomForestRegression)
def random_forest_regression(df, label, test_size=0.2, n_estimators=100, criterion='squared_error'):
    """Fit a random forest regressor on a train split of ``df`` and predict the test split.

    Object-dtype columns are label-encoded and remaining missing values are
    filled with 0 before splitting with ``random_state=1``.

    Args:
        df: Input dataframe holding the features and the ``label`` column.
        label: Name of the target column.
        test_size: Forwarded to ``train_test_split``.
        n_estimators: Forwarded to ``RandomForestRegressor``.
        criterion: Forwarded to ``RandomForestRegressor``.

    Returns:
        dict: ``{'te_pred_prob': ...}`` with the ``predict`` output for the test split.
    """
    # Copy first so the label encoding below is not written into the caller's frame.
    df = df.copy()
    nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
    for col in nonnumeric_columns:
        df[col] = LabelEncoder().fit_transform(df[col])
    df = df.fillna(0)

    features = [col for col in df if col != label]
    tr_x, te_x, tr_y, _ = train_test_split(
        df[features], df[label], test_size=test_size, random_state=1
    )
    model = RandomForestRegressor(n_estimators=n_estimators, criterion=criterion)
    model.fit(tr_x, tr_y)
    return {'te_pred_prob': model.predict(te_x)}
@registry.register("regression_model", GradientBoostingRegression)
def gradient_boosting_regression(df, label, test_size=0.2, n_estimators=100, learning_rate=0.1):
    """Fit a gradient boosting regressor on a train split of ``df`` and predict the test split.

    Object-dtype columns are label-encoded and remaining missing values are
    filled with 0 before splitting with ``random_state=1``.

    Args:
        df: Input dataframe holding the features and the ``label`` column.
        label: Name of the target column.
        test_size: Forwarded to ``train_test_split``.
        n_estimators: Forwarded to ``GradientBoostingRegressor``.
        learning_rate: Forwarded to ``GradientBoostingRegressor``.

    Returns:
        dict: ``{'te_pred_prob': ...}`` with the ``predict`` output for the test split.
    """
    # Copy first so the label encoding below is not written into the caller's frame.
    df = df.copy()
    nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
    for col in nonnumeric_columns:
        df[col] = LabelEncoder().fit_transform(df[col])
    df = df.fillna(0)

    features = [col for col in df if col != label]
    tr_x, te_x, tr_y, _ = train_test_split(
        df[features], df[label], test_size=test_size, random_state=1
    )
    model = GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=learning_rate)
    model.fit(tr_x, tr_y)
    return {'te_pred_prob': model.predict(te_x)}
# Manual smoke run: exercises the three classifiers on the iris data and the
# three regressors on a synthetic regression set, printing each prediction.
if __name__ == '__main__':
def run():
# --- classification demo on iris ---
from sklearn.datasets import load_iris
loader = load_iris(as_frame=True)
df = loader['data']
df['target'] = loader['target']
# Cast one column to str and one to int so mixed dtypes are exercised.
df[df.columns[0]] = df[df.columns[0]].astype(str)
df[df.columns[1]] = df[df.columns[1]].astype(int)
df['target'] = df['target'].astype(str)
print(df)
print('####'*5)
res = logistic_regression_classification(df, 'target', test_size=0.25, penalty='l2', dual=False)
print(res['te_pred_prob'])
print('####'*5)
res = random_forest_classification(df, 'target', test_size=0.25, n_estimators=100, criterion='gini')
print(res['te_pred_prob'])
print('####'*5)
res = gradient_boosting_classification(df, 'target', test_size=0.25, n_estimators=100, learning_rate=0.1)
print(res['te_pred_prob'])
# --- regression demo on synthetic data ---
from sklearn.datasets import make_regression
import pandas as pd
loader = make_regression()
df = pd.DataFrame(loader[0])
df['target'] = loader[1]
# Same dtype mix as above; the target stays numeric for regression.
df[df.columns[0]] = df[df.columns[0]].astype(str)
df[df.columns[1]] = df[df.columns[1]].astype(int)
# df['target'] = df['target'].astype(str)
print(df)
print('####' * 5)
res = linear_regression(df, 'target', test_size=0.25, )
print(res['te_pred_prob'])
print('####' * 5)
res = random_forest_regression(df, 'target', test_size=0.25, n_estimators=100, criterion='squared_error')
print(res['te_pred_prob'])
print('####' * 5)
res = gradient_boosting_regression(df, 'target', test_size=0.25, n_estimators=100, learning_rate=0.1)
print(res['te_pred_prob'])
run()

View file

@ -1,6 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2023/11/16 16:37
# @Author : lidanyang
# @File : __init__.py
# @Desc :

View file

@ -1,78 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2023/11/16 16:38
# @Author : lidanyang
# @File : register.py
# @Desc :
import inspect
from typing import Type, Optional, Callable, Dict, Union, List
from metagpt.tools.functions.schemas.base import ToolSchema
class FunctionRegistry:
    """Registry of tool functions and their schemas, grouped by module name.

    ``self.functions`` maps ``module -> function name -> {"func": ..., "schema": ...}``.
    """

    def __init__(self):
        self.functions: Dict[str, Dict[str, Dict]] = {}

    @staticmethod
    def _check_param_consistency(func_params, schema):
        """Raise ``ValueError`` unless the function's parameter names equal the schema's property names."""
        param_names = set(func_params.keys())
        schema_names = set(schema["parameters"]["properties"].keys())
        if param_names != schema_names:
            raise ValueError("Function parameters do not match schema properties")

    def register(self, module: str, tool_schema: "Type[ToolSchema]") -> Callable:
        """Return a decorator that registers a function under ``module``.

        Args:
            module: Name of the group the function is stored under.
            tool_schema: Class whose ``schema()`` describes the function's parameters.

        Raises:
            ValueError: When decorating, if the function name is already
                registered in ``module`` or its parameters differ from the schema.
        """

        def wrapper(func: Callable) -> Callable:
            name = func.__name__
            if name in self.functions.get(module, {}):
                raise ValueError(f"Function {func.__name__} is already registered in {module}")

            schema = tool_schema.schema()
            schema["name"] = name
            self._check_param_consistency(inspect.signature(func).parameters, schema)

            # Create the module entry only after validation has passed, so a
            # rejected registration leaves no empty module behind.
            self.functions.setdefault(module, {})[name] = {
                "func": func,
                "schema": schema,
            }
            return func

        return wrapper

    def get(self, module: str, name: str) -> Optional[Dict]:
        """Get the ``{"func", "schema"}`` entry by module and name, or ``None``."""
        module_registry = self.functions.get(module, {})
        return module_registry.get(name)

    def get_by_name(self, name: str) -> Optional[Dict]:
        """Get the first entry called ``name`` in any module, or ``None``."""
        for module_registry in self.functions.values():
            if name in module_registry:
                return module_registry[name]
        return None

    def get_all_by_module(self, module: str) -> Optional[Dict]:
        """Get all entries of a module (empty dict if the module is unknown)."""
        return self.functions.get(module, {})

    def get_schema(self, module: str, name: str) -> Optional[Dict]:
        """Get the schema by module and name, or ``None``."""
        module_registry = self.functions.get(module, {})
        return module_registry.get(name, {}).get("schema")

    def get_schemas(self, module: str, names: List[str]) -> List[Dict]:
        """Get schemas by module and names; unknown names yield ``None`` entries."""
        module_registry = self.functions.get(module, {})
        return [module_registry.get(name, {}).get("schema") for name in names]

    def get_all_schema_by_module(self, module: str) -> List[Dict]:
        """Get all schemas registered under a module."""
        module_registry = self.functions.get(module, {})
        return [entry.get("schema") for entry in module_registry.values()]
# Shared module-level registry instance.
registry = FunctionRegistry()

View file

@ -1,100 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2023/11/16 16:34
# @Author : lidanyang
# @File : base.py
# @Desc : Build base class to generate schema for tool
from typing import Any, List, Optional, get_type_hints
class NoDefault:
    """Marker type meaning "no default value was supplied".

    Using a dedicated type keeps an explicit default of ``None`` distinguishable
    from a default that is absent altogether.
    """
def tool_field(
    description: str, default: Any = NoDefault(), enum: Optional[List[Any]] = None, **kwargs
):
    """
    Build the dictionary that describes one tool parameter.

    Args:
        description (str): A description of the field.
        default (Any, optional): The default value for the field. When omitted,
            a ``NoDefault`` instance is stored to mark the default as missing.
        enum (Optional[List[Any]], optional): A list of possible values for the field. Defaults to None.
        **kwargs: Additional entries; they are applied last and so override the
            three standard keys on a name clash.

    Returns:
        dict: ``description``, ``default`` and ``enum`` plus any extra keyword entries.
    """
    return {
        "description": description,
        "default": default,
        "enum": enum,
        **kwargs,
    }
class ToolSchema:
    """Base class that turns annotated class attributes into a tool schema dict."""

    @staticmethod
    def format_type(type_hint):
        """
        Format a type hint into a string representation.

        Args:
            type_hint (type): The type hint to format.

        Returns:
            str: The bare name for builtins, ``module.name`` for other classes,
            ``origin[arg, ...]`` for parametrised generics, ``str(type_hint)`` otherwise.
        """
        # Handle generic types (like List[int]) first: on some Python versions
        # ``isinstance(list[int], type)`` is true, and the plain-type branch
        # below would then drop the type arguments.
        if hasattr(type_hint, "__origin__") and hasattr(type_hint, "__args__"):
            origin_type = ToolSchema.format_type(type_hint.__origin__)
            args_type = ", ".join(ToolSchema.format_type(t) for t in type_hint.__args__)
            return f"{origin_type}[{args_type}]"
        if isinstance(type_hint, type):
            # Handle built-in types separately
            if type_hint.__module__ == "builtins":
                return type_hint.__name__
            return f"{type_hint.__module__}.{type_hint.__name__}"
        return str(type_hint)

    @classmethod
    def schema(cls):
        """
        Generate a schema dictionary for the class.

        The schema includes the class name, its docstring as the description,
        and one property per annotated attribute whose value is a dict (as
        produced by the field helper). Attributes without a default are listed
        under ``required``.

        Returns:
            dict: A dictionary representing the schema of the class.
        """
        schema = {
            "name": cls.__name__,
            "description": cls.__doc__,
            "parameters": {"type": "object", "properties": {}, "required": []},
        }
        type_hints = get_type_hints(cls)
        for attr, type_hint in type_hints.items():
            value = getattr(cls, attr, None)
            if not isinstance(value, dict):
                continue
            # Drop None-valued entries, but keep an explicit ``default`` of None.
            prop_info = {k: v for k, v in value.items() if v is not None or k == "default"}
            # Guard the lookup: a field dict without a "default" key is treated
            # as having no default instead of raising KeyError.
            if "default" in prop_info and isinstance(prop_info["default"], NoDefault):
                del prop_info["default"]
            prop_info["type"] = ToolSchema.format_type(type_hint)
            schema["parameters"]["properties"][attr] = prop_info
            # Check for required fields
            if "default" not in prop_info:
                schema["parameters"]["required"].append(attr)
        return schema

View file

@ -1,62 +0,0 @@
import pandas as pd
from metagpt.tools.functions.schemas.base import tool_field, ToolSchema
# Tool schema for filling missing values in selected columns.
class FillMissingValue(ToolSchema):
    """Completing missing values with simple strategies"""

    df: pd.DataFrame = tool_field(description="input dataframe")
    features: list = tool_field(description="columns to be processed")
    # List the accepted strategies in the schema so callers cannot request an
    # unsupported one.
    strategy: str = tool_field(
        description="the imputation strategy",
        default="mean",
        enum=["mean", "median", "most_frequent", "constant"],
    )
    # The default of None is kept in the generated schema as an explicit default.
    fill_value: int = tool_field(
        description="fill_value is used to replace all occurrences of missing_values",
        default=None,
    )
# class LabelEncode(ToolSchema):
# """Completing missing values with simple strategies"""
# df: pd.DataFrame = tool_field(description="input dataframe")
# features: list = tool_field(description="columns to be processed")
# Tool schema for binning continuous columns into integer-encoded intervals.
class SplitBins(ToolSchema):
    """Bin continuous data into intervals and return the bin identifier encoded as an integer value"""

    df: pd.DataFrame = tool_field(description="input dataframe")
    features: list = tool_field(description="columns to be processed")
    # List the accepted binning strategies in the schema so callers cannot
    # request an unsupported one.
    strategy: str = tool_field(
        description="Strategy used to define the widths of the bins",
        default="quantile",
        enum=["quantile", "uniform", "kmeans"],
    )
# Tool schema for min-max scaling of selected columns. The docstring is emitted
# as the schema description, so its "witch" typo is corrected to "which".
class MinMaxScale(ToolSchema):
    """Transform features by scaling each feature to a range, which is (0, 1)"""

    df: pd.DataFrame = tool_field(description="input dataframe")
    features: list = tool_field(description="columns to be processed")
# Tool schema for the standard-scaling tool; neither field has a default.
class StandardScale(ToolSchema):
    """Standardize features by removing the mean and scaling to unit variance"""

    df: pd.DataFrame = tool_field(
        description="input dataframe",
    )
    features: list = tool_field(
        description="columns to be processed",
    )
# Tool schema for the log-transform tool; neither field has a default.
class LogTransform(ToolSchema):
    """Performs a logarithmic transformation on the specified columns"""

    df: pd.DataFrame = tool_field(
        description="input dataframe",
    )
    features: list = tool_field(
        description="columns to be processed",
    )
# Tool schema for the max-abs scaling tool; neither field has a default.
class MaxAbsScale(ToolSchema):
    """Scale each feature by its maximum absolute value"""

    df: pd.DataFrame = tool_field(
        description="input dataframe",
    )
    features: list = tool_field(
        description="columns to be processed",
    )
# Tool schema for the robust-scaling tool; neither field has a default.
class RobustScale(ToolSchema):
    """Scale features using statistics that are robust to outliers, the quantile_range is (25.0, 75.0)"""

    df: pd.DataFrame = tool_field(
        description="input dataframe",
    )
    features: list = tool_field(
        description="columns to be processed",
    )
# Tool schema for the ordinal-encoding tool; neither field has a default.
class OrdinalEncode(ToolSchema):
    """Encode categorical features as an integer array"""

    df: pd.DataFrame = tool_field(
        description="input dataframe",
    )
    features: list = tool_field(
        description="columns to be processed",
    )

View file

@ -0,0 +1,306 @@
FillMissingValue:
type: class
description: "Completing missing values with simple strategies"
methods:
__init__:
description: "Initialize self."
parameters:
properties:
features:
type: list
description: "columns to be processed"
strategy:
type: str
description: "the imputation strategy"
default: mean
enum:
- mean
- median
- most_frequent
- constant
fill_value:
type: int
description: "fill_value is used to replace all occurrences of missing_values"
default: null
required:
- features
fit:
description: "Fit the FillMissingValue model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
MinMaxScale:
type: class
description: "Transform features by scaling each feature to a range, which is (0, 1)"
methods:
__init__:
description: "Initialize self."
parameters:
properties:
features:
type: list
description: "columns to be processed"
required:
- features
fit:
description: "Fit the MinMaxScale model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
StandardScale:
type: class
description: "Standardize features by removing the mean and scaling to unit variance"
methods:
__init__:
description: "Initialize self."
parameters:
properties:
features:
type: list
description: "columns to be processed"
required:
- features
fit:
description: "Fit the StandardScale model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
MaxAbsScale:
type: class
description: "Scale each feature by its maximum absolute value"
methods:
__init__:
description: "Initialize self."
parameters:
properties:
features:
type: list
description: "columns to be processed"
required:
- features
fit:
description: "Fit the MaxAbsScale model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
LabelEncode:
type: class
description: "Apply label encoding to specified categorical columns in-place."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
features:
type: list
description: "Categorical columns to be label encoded"
required:
- features
fit:
description: "Fit the LabelEncode model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
OneHotEncode:
type: class
description: "Apply one-hot encoding to specified categorical columns, the original columns will be dropped."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
features:
type: list
description: "Categorical columns to be one-hot encoded and dropped"
required:
- features
fit:
description: "Fit the OneHotEncode model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."

View file

@ -1,100 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2023/11/17 10:34
# @Author : lidanyang
# @File : feature_engineering.py
# @Desc : Schema for feature engineering functions
from typing import List
import pandas as pd
from metagpt.tools.functions.schemas.base import ToolSchema, tool_field
# Tool schema for polynomial feature expansion; only `degree` has a default.
class PolynomialExpansion(ToolSchema):
    """Generate polynomial and interaction features from selected columns, excluding the bias column."""

    df: pd.DataFrame = tool_field(
        description="DataFrame to process.",
    )
    cols: list = tool_field(
        description="Columns for polynomial expansion.",
    )
    degree: int = tool_field(
        description="Degree of polynomial features.",
        default=2,
    )
# Tool schema for one-hot encoding; neither field has a default.
class OneHotEncoding(ToolSchema):
    """Apply one-hot encoding to specified categorical columns in a DataFrame."""

    df: pd.DataFrame = tool_field(
        description="DataFrame to process.",
    )
    cols: list = tool_field(
        description="Categorical columns to be one-hot encoded.",
    )
# Tool schema for frequency encoding; neither field has a default.
class FrequencyEncoding(ToolSchema):
    """Convert categorical columns to frequency encoding."""

    df: pd.DataFrame = tool_field(
        description="DataFrame to process.",
    )
    cols: list = tool_field(
        description="Categorical columns to be frequency encoded.",
    )
# Tool schema for pairwise categorical crossing; only `max_cat_num` has a default.
class CatCross(ToolSchema):
    """Create pairwise crossed features from categorical columns, joining values with '_'."""

    df: pd.DataFrame = tool_field(
        description="DataFrame to process.",
    )
    cols: list = tool_field(
        description="Columns to be pairwise crossed.",
    )
    max_cat_num: int = tool_field(
        description="Maximum unique categories per crossed feature.",
        default=100,
    )
# Tool schema for grouped aggregation; no field has a default.
class GroupStat(ToolSchema):
    """Perform aggregation operations on a specified column grouped by certain categories."""

    df: pd.DataFrame = tool_field(
        description="DataFrame to process.",
    )
    group_col: str = tool_field(
        description="Column used for grouping.",
    )
    agg_col: str = tool_field(
        description="Column on which aggregation is performed.",
    )
    # The description text spans two lines; the literal is kept unchanged.
    agg_funcs: list = tool_field(
        description="""List of aggregation functions to apply, such as ['mean', 'std'].
Each function must be supported by pandas."""
    )
# Tool schema for extracting time components; no field has a default.
class ExtractTimeComps(ToolSchema):
    """Extract specific time components from a designated time column in a DataFrame."""

    df: pd.DataFrame = tool_field(
        description="DataFrame to process.",
    )
    time_col: str = tool_field(
        description="The name of the column containing time data.",
    )
    # The description text spans two lines; the literal is kept unchanged.
    time_comps: List[str] = tool_field(
        description="""List of time components to extract.
Each component must be in ['year', 'month', 'day', 'hour', 'dayofweek', 'is_weekend']."""
    )
# Tool schema for time-based shifting of a column; no field has a default.
class FeShiftByTime(ToolSchema):
    """Shift column values in a DataFrame based on specified time intervals."""

    df: pd.DataFrame = tool_field(
        description="DataFrame to process.",
    )
    time_col: str = tool_field(
        description="Column for time-based shifting.",
    )
    group_col: str = tool_field(
        description="Column for grouping before shifting.",
    )
    shift_col: str = tool_field(
        description="Column to shift.",
    )
    periods: list = tool_field(
        description="Time intervals for shifting.",
    )
    # `enum` lists the frequency codes advertised in the schema.
    freq: str = tool_field(
        description="Frequency unit for time intervals (e.g., 'D', 'M').",
        enum=["D", "M", "Y", "W", "H"],
    )
# Tool schema for time-based rolling statistics; no field has a default.
class FeRollingByTime(ToolSchema):
    """Calculate rolling statistics for a DataFrame column over time intervals."""

    df: pd.DataFrame = tool_field(
        description="DataFrame to process.",
    )
    time_col: str = tool_field(
        description="Column for time-based rolling.",
    )
    group_col: str = tool_field(
        description="Column for grouping before rolling.",
    )
    rolling_col: str = tool_field(
        description="Column for rolling calculations.",
    )
    periods: list = tool_field(
        description="Window sizes for rolling.",
    )
    # `enum` lists the frequency codes advertised in the schema.
    freq: str = tool_field(
        description="Frequency unit for time windows (e.g., 'D', 'M').",
        enum=["D", "M", "Y", "W", "H"],
    )
    # The description text spans two lines; the literal is kept unchanged.
    agg_funcs: list = tool_field(
        description="""List of aggregation functions for rolling, like ['mean', 'std'].
Each function must be in ['mean', 'std', 'min', 'max', 'median', 'sum', 'count']."""
    )

View file

@ -0,0 +1,433 @@
PolynomialExpansion:
type: class
description: "Add polynomial and interaction features from selected numeric columns, excluding the bias column."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
cols:
type: list
description: "Columns for polynomial expansion."
degree:
type: int
description: "The degree of the polynomial features."
default: 2
required:
- cols
fit:
description: "Fit the PolynomialExpansion model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
CatCount:
type: class
description: "Add value counts of a categorical column as new feature."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
col:
type: str
description: "Column for value counts."
required:
- col
fit:
description: "Fit the CatCount model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
TargetMeanEncoder:
type: class
description: "Encodes a categorical column by the mean of the label column, and adds the result as a new feature."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
col:
type: str
description: "Column to be mean encoded."
label:
type: str
description: "Predicted label column."
required:
- col
- label
fit:
description: "Fit the TargetMeanEncoder model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
KFoldTargetMeanEncoder:
type: class
description: "Adds a new feature to the DataFrame by k-fold mean encoding of a categorical column using the label column."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
col:
type: str
description: "Column to be k-fold mean encoded."
label:
type: str
description: "Predicted label column."
n_splits:
type: int
description: "Number of splits for K-fold."
default: 5
random_state:
type: int
description: "Random seed."
default: 2021
required:
- col
- label
fit:
description: "Fit the KFoldTargetMeanEncoder model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
CatCross:
type: class
description: "Add pairwise crossed features and convert them to numerical features."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
cols:
type: list
description: "Columns to be pairwise crossed."
max_cat_num:
type: int
description: "Maximum unique categories per crossed feature."
default: 100
required:
- cols
fit:
description: "Fit the CatCross model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
GroupStat:
type: class
description: "Aggregate specified column in a DataFrame grouped by another column, adding new features named '<agg_col>_<agg_func>_by_<group_col>'."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
group_col:
type: str
description: "Column used for grouping."
agg_col:
type: str
description: "Column on which aggregation is performed."
agg_funcs:
type: list
description: >-
List of aggregation functions to apply, such as ['mean', 'std'].
Each function must be supported by pandas.
required:
- group_col
- agg_col
- agg_funcs
fit:
description: "Fit the GroupStat model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
SplitBins:
type: class
description: "Inplace binning of continuous data into intervals, returning integer-encoded bin identifiers directly."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
cols:
type: list
description: "Columns to be binned inplace."
strategy:
type: str
description: "Strategy used to define the widths of the bins."
default: quantile
enum:
- quantile
- uniform
- kmeans
required:
- cols
fit:
description: "Fit the SplitBins model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
GeneralSelection:
type: class
description: "Drop features that are entirely NaN and features with only one unique value."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
label_col:
type: str
description: "Label column name."
required:
- label_col
fit:
description: "Fit the GeneralSelection model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."

View file

@ -1,55 +0,0 @@
import pandas as pd
from metagpt.tools.functions.schemas.base import tool_field, ToolSchema
# Tool schema for a logistic-regression classifier; `df` and `label` have no default.
class LogisticRegressionClassification(ToolSchema):
    """Logistic Regression (aka logit, MaxEnt) classifier"""

    df: pd.DataFrame = tool_field(description="input dataframe")
    label: str = tool_field(description="target name")
    test_size: float = tool_field(
        description="The proportion of the test set to all the data",
        default=0.2,
    )
    penalty: str = tool_field(description="Specify the norm of the penalty", default="l2")
    # `dual` is a boolean flag; its default was "l2", copied from `penalty`.
    # False selects the primal formulation.
    dual: bool = tool_field(
        description="Dual (constrained) or primal (regularized) formulation",
        default=False,
    )
# Tool schema for a random-forest classifier; `df` and `label` have no default.
class RandomForestClassification(ToolSchema):
    """random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting"""

    df: pd.DataFrame = tool_field(
        description="input dataframe",
    )
    label: str = tool_field(
        description="target name",
    )
    test_size: float = tool_field(
        description="The proportion of the test set to all the data",
        default=0.2,
    )
    n_estimators: int = tool_field(
        description="The number of trees in the forest",
        default=100,
    )
    criterion: str = tool_field(
        description="The function to measure the quality of a split",
        default="gini",
    )
# Tool schema for a gradient-boosting classifier; `df` and `label` have no default.
class GradientBoostingClassification(ToolSchema):
    """Gradient Boosting for classification.This algorithm builds an additive model in a forward stage-wise fashion"""

    df: pd.DataFrame = tool_field(
        description="input dataframe",
    )
    label: str = tool_field(
        description="target name",
    )
    test_size: float = tool_field(
        description="The proportion of the test set to all the data",
        default=0.2,
    )
    n_estimators: int = tool_field(
        description="The number of boosting stages to perform",
        default=100,
    )
    learning_rate: float = tool_field(
        description="Learning rate shrinks the contribution of each tree by learning_rate",
        default=0.1,
    )
# Tool schema for a linear-regression model; only `test_size` has a default.
class LinearRegressionRegression(ToolSchema):
    """Ordinary least squares Linear Regression."""

    df: pd.DataFrame = tool_field(
        description="input dataframe",
    )
    label: str = tool_field(
        description="target name",
    )
    test_size: float = tool_field(
        description="The proportion of the test set to all the data",
        default=0.2,
    )
# Tool schema for a random-forest regressor; `df` and `label` have no default.
class RandomForestRegression(ToolSchema):
    """random forest is a meta estimator that fits a number of decision tree on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting"""

    df: pd.DataFrame = tool_field(
        description="input dataframe",
    )
    label: str = tool_field(
        description="target name",
    )
    test_size: float = tool_field(
        description="The proportion of the test set to all the data",
        default=0.2,
    )
    n_estimators: int = tool_field(
        description="The number of trees in the forest",
        default=100,
    )
    criterion: str = tool_field(
        description="The function to measure the quality of a split",
        default="squared_error",
    )
# Tool schema for a gradient-boosting regressor; `df` and `label` have no default.
class GradientBoostingRegression(ToolSchema):
    """Gradient Boosting for regression.This estimator builds an additive model in a forward stage-wise fashion"""

    df: pd.DataFrame = tool_field(
        description="input dataframe",
    )
    label: str = tool_field(
        description="target name",
    )
    test_size: float = tool_field(
        description="The proportion of the test set to all the data",
        default=0.2,
    )
    n_estimators: int = tool_field(
        description="The number of boosting stages to perform",
        default=100,
    )
    learning_rate: float = tool_field(
        description="Learning rate shrinks the contribution of each tree by learning_rate",
        default=0.1,
    )