mirror of
https://github.com/FoundationAgents/MetaGPT.git
synced 2026-06-23 15:48:11 +02:00
Merge branch 'dev' into dev_make_tools
This commit is contained in:
commit
ea7e11665d
24 changed files with 1801 additions and 1226 deletions
|
|
@ -4,5 +4,3 @@
|
|||
# @Author : lidanyang
|
||||
# @File : __init__.py
|
||||
# @Desc :
|
||||
from metagpt.tools.functions.register.register import registry
|
||||
import metagpt.tools.functions.libs.feature_engineering
|
||||
|
|
|
|||
16
metagpt/tools/functions/libs/base.py
Normal file
16
metagpt/tools/functions/libs/base.py
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2023/12/10 20:12
|
||||
# @Author : lidanyang
|
||||
# @File : base
|
||||
# @Desc :
|
||||
class MLProcess(object):
|
||||
def fit(self, df):
|
||||
raise NotImplementedError
|
||||
|
||||
def transform(self, df):
|
||||
raise NotImplementedError
|
||||
|
||||
def fit_transform(self, df):
|
||||
self.fit(df)
|
||||
return self.transform(df)
|
||||
|
|
@ -1,123 +1,153 @@
|
|||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
import pandas as pd
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
from sklearn.preprocessing import KBinsDiscretizer
|
||||
from sklearn.preprocessing import MinMaxScaler
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.preprocessing import MaxAbsScaler
|
||||
from sklearn.preprocessing import RobustScaler
|
||||
from sklearn.preprocessing import MinMaxScaler
|
||||
from sklearn.preprocessing import OneHotEncoder
|
||||
from sklearn.preprocessing import OrdinalEncoder
|
||||
from sklearn.preprocessing import RobustScaler
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
|
||||
from metagpt.tools.functions import registry
|
||||
from metagpt.tools.functions.schemas.data_preprocess import *
|
||||
from metagpt.tools.functions.libs.base import MLProcess
|
||||
|
||||
|
||||
@registry.register("data_preprocess", FillMissingValue)
|
||||
def fill_missing_value(df: pd.DataFrame, features: list, strategy: str = 'mean', fill_value=None,):
|
||||
df[features] = SimpleImputer(strategy=strategy, fill_value=fill_value).fit_transform(df[features])
|
||||
return df
|
||||
class FillMissingValue(MLProcess):
|
||||
def __init__(self, features: list, strategy: str = 'mean', fill_value=None,):
|
||||
self.features = features
|
||||
self.strategy = strategy
|
||||
self.fill_value = fill_value
|
||||
self.si = None
|
||||
|
||||
def fit(self, df: pd.DataFrame):
|
||||
self.si = SimpleImputer(strategy=self.strategy, fill_value=self.fill_value)
|
||||
self.si.fit(df[self.features])
|
||||
|
||||
def transform(self, df: pd.DataFrame):
|
||||
df[self.features] = self.si.transform(df[self.features])
|
||||
return df
|
||||
|
||||
|
||||
# @registry.register("data_preprocess", FillMissingValue)
|
||||
# def label_encode(df: pd.DataFrame, features: list,):
|
||||
# for col in features:
|
||||
# df[col] = LabelEncoder().fit_transform(df[col])
|
||||
# return df
|
||||
class MinMaxScale(MLProcess):
|
||||
def __init__(self, features: list,):
|
||||
self.features = features
|
||||
self.mms = None
|
||||
|
||||
def fit(self, df: pd.DataFrame):
|
||||
self.mms = MinMaxScaler()
|
||||
self.mms.fit(df[self.features])
|
||||
|
||||
def transform(self, df: pd.DataFrame):
|
||||
df[self.features] = self.mms.transform(df[self.features])
|
||||
return df
|
||||
|
||||
|
||||
@registry.register("data_preprocess", SplitBins)
|
||||
def split_bins(df: pd.DataFrame, features: list, strategy: str = 'quantile',):
|
||||
df[features] = KBinsDiscretizer(strategy=strategy, encode='ordinal').fit_transform(df[features])
|
||||
return df
|
||||
class StandardScale(MLProcess):
|
||||
def __init__(self, features: list,):
|
||||
self.features = features
|
||||
self.ss = None
|
||||
|
||||
def fit(self, df: pd.DataFrame):
|
||||
self.ss = StandardScaler()
|
||||
self.ss.fit(df[self.features])
|
||||
|
||||
def transform(self, df: pd.DataFrame):
|
||||
df[self.features] = self.ss.transform(df[self.features])
|
||||
return df
|
||||
|
||||
|
||||
@registry.register("data_preprocess", MinMaxScale)
|
||||
def min_max_scale(df: pd.DataFrame, features: list, ):
|
||||
df[features] = MinMaxScaler().fit_transform(df[features])
|
||||
return df
|
||||
class MaxAbsScale(MLProcess):
|
||||
def __init__(self, features: list,):
|
||||
self.features = features
|
||||
self.mas = None
|
||||
|
||||
def fit(self, df: pd.DataFrame):
|
||||
self.mas = MaxAbsScaler()
|
||||
self.mas.fit(df[self.features])
|
||||
|
||||
def transform(self, df: pd.DataFrame):
|
||||
df[self.features] = self.mas.transform(df[self.features])
|
||||
return df
|
||||
|
||||
|
||||
@registry.register("data_preprocess", StandardScale)
|
||||
def standard_scale(df: pd.DataFrame, features: list, ):
|
||||
df[features] = StandardScaler().fit_transform(df[features])
|
||||
return df
|
||||
class RobustScale(MLProcess):
|
||||
def __init__(self, features: list,):
|
||||
self.features = features
|
||||
self.rs = None
|
||||
|
||||
def fit(self, df: pd.DataFrame):
|
||||
self.rs = RobustScaler()
|
||||
self.rs.fit(df[self.features])
|
||||
|
||||
def transform(self, df: pd.DataFrame):
|
||||
df[self.features] = self.rs.transform(df[self.features])
|
||||
return df
|
||||
|
||||
|
||||
@registry.register("data_preprocess", LogTransform)
|
||||
def log_transform(df: pd.DataFrame, features: list, ):
|
||||
for col in features:
|
||||
if df[col].min() <= 0:
|
||||
df[col] = df[col] - df[col].min() + 2
|
||||
df[col] = np.log(df[col])
|
||||
return df
|
||||
class OrdinalEncode(MLProcess):
|
||||
def __init__(self, features: list,):
|
||||
self.features = features
|
||||
self.oe = None
|
||||
|
||||
def fit(self, df: pd.DataFrame):
|
||||
self.oe = OrdinalEncoder()
|
||||
self.oe.fit(df[self.features])
|
||||
|
||||
def transform(self, df: pd.DataFrame):
|
||||
df[self.features] = self.oe.transform(df[self.features])
|
||||
return df
|
||||
|
||||
|
||||
@registry.register("data_preprocess", MaxAbsScale)
|
||||
def max_abs_scale(df: pd.DataFrame, features: list, ):
|
||||
df[features] = MaxAbsScaler().fit_transform(df[features])
|
||||
return df
|
||||
class OneHotEncode(MLProcess):
|
||||
def __init__(self, features: list,):
|
||||
self.features = features
|
||||
self.ohe = None
|
||||
|
||||
def fit(self, df: pd.DataFrame):
|
||||
self.ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)
|
||||
self.ohe.fit(df[self.features])
|
||||
|
||||
def transform(self, df: pd.DataFrame):
|
||||
ts_data = self.ohe.transform(df[self.features])
|
||||
new_columns = self.ohe.get_feature_names_out(self.features)
|
||||
ts_data = pd.DataFrame(ts_data, columns=new_columns, index=df.index)
|
||||
df.drop(self.features, axis=1, inplace=True)
|
||||
df = pd.concat([df, ts_data], axis=1)
|
||||
return df
|
||||
|
||||
|
||||
@registry.register("data_preprocess", RobustScale)
|
||||
def robust_scale(df: pd.DataFrame, features: list, ):
|
||||
df[features] = RobustScaler().fit_transform(df[features])
|
||||
return df
|
||||
class LabelEncode(MLProcess):
|
||||
def __init__(self, features: list,):
|
||||
self.features = features
|
||||
self.le_encoders = []
|
||||
|
||||
def fit(self, df: pd.DataFrame):
|
||||
for col in self.features:
|
||||
le = LabelEncoder().fit(df[col].astype(str).unique().tolist() + ['unknown'])
|
||||
self.le_encoders.append(le)
|
||||
|
||||
def transform(self, df: pd.DataFrame):
|
||||
for i in range(len(self.features)):
|
||||
data_list = df[self.features[i]].astype(str).tolist()
|
||||
for unique_item in np.unique(df[self.features[i]].astype(str)):
|
||||
if unique_item not in self.le_encoders[i].classes_:
|
||||
data_list = ['unknown' if x == unique_item else x for x in data_list]
|
||||
df[self.features[i]] = self.le_encoders[i].transform(data_list)
|
||||
return df
|
||||
|
||||
|
||||
@registry.register("data_preprocess", OrdinalEncode)
|
||||
def ordinal_encode(df: pd.DataFrame, features: list,):
|
||||
df[features] = OrdinalEncoder().fit_transform(df[features])
|
||||
return df
|
||||
def get_column_info(df: pd.DataFrame) -> dict:
|
||||
data = []
|
||||
for i in df.columns:
|
||||
nan_freq = float("%.2g" % (df[i].isna().mean() * 100))
|
||||
n_unique = df[i].nunique()
|
||||
data_type = str(df[i].dtype).replace("dtype('", "").replace("')", "")
|
||||
if data_type == "O":
|
||||
data_type = "object"
|
||||
data.append([i, data_type, nan_freq, n_unique])
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
def run():
|
||||
V = {
|
||||
'a': [-1, 2, 3, 6, 5, 4],
|
||||
'b': [1.1, 2.2, 3.3, 6.6, 5.5, 4.4],
|
||||
'c': ['aa', 'bb', 'cc', 'dd', 'ee', 'ff'],
|
||||
'd': [1, None, 3, None, 5, 4],
|
||||
'e': [1.1, np.NAN, 3.3, None, 5.5, 4.4],
|
||||
'f': ['aa', np.NAN, 'cc', None, '', 'ff'],
|
||||
|
||||
}
|
||||
|
||||
df = pd.DataFrame(V)
|
||||
print(df.dtypes)
|
||||
|
||||
numeric_features = ['a', 'b', 'd', 'e']
|
||||
numeric_features_wo_miss = ['a', 'b', ]
|
||||
categorial_features = ['c', 'f']
|
||||
|
||||
df_ = fill_missing_value(df.copy(), numeric_features)
|
||||
print(df_)
|
||||
df_ = fill_missing_value(df.copy(), categorial_features, strategy='constant', fill_value='hehe')
|
||||
print(df_)
|
||||
|
||||
df_ = fill_missing_value(df.copy(), numeric_features, strategy='constant', fill_value=999)
|
||||
print(df_)
|
||||
|
||||
# df_ = label_encode(df.copy(), numeric_features + categorial_features, )
|
||||
# print(df_)
|
||||
|
||||
df_ = split_bins(df.copy(), numeric_features_wo_miss, strategy='quantile')
|
||||
print(df_)
|
||||
|
||||
df_ = min_max_scale(df.copy(), numeric_features, )
|
||||
print(df_)
|
||||
|
||||
df_ = standard_scale(df.copy(), numeric_features, )
|
||||
print(df_)
|
||||
|
||||
df_ = log_transform(df.copy(), numeric_features, )
|
||||
print(df_)
|
||||
|
||||
df_ = max_abs_scale(df.copy(), numeric_features, )
|
||||
print(df_)
|
||||
|
||||
df_ = robust_scale(df.copy(), numeric_features, )
|
||||
print(df_)
|
||||
run()
|
||||
samples = pd.DataFrame(
|
||||
data,
|
||||
columns=["Column_name", "Data_type", "NaN_Frequency(%)", "N_unique"],
|
||||
)
|
||||
return samples.to_dict(orient='list')
|
||||
|
|
|
|||
|
|
@ -3,172 +3,290 @@
|
|||
# @Time : 2023/11/17 10:33
|
||||
# @Author : lidanyang
|
||||
# @File : feature_engineering.py
|
||||
# @Desc : Feature Engineering Functions
|
||||
# @Desc : Feature Engineering Tools
|
||||
import itertools
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from dateutil.relativedelta import relativedelta
|
||||
from joblib import Parallel, delayed
|
||||
from pandas.api.types import is_numeric_dtype
|
||||
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder
|
||||
from pandas.core.dtypes.common import is_object_dtype
|
||||
from sklearn.model_selection import KFold
|
||||
from sklearn.preprocessing import PolynomialFeatures, KBinsDiscretizer
|
||||
|
||||
from metagpt.tools.functions import registry
|
||||
from metagpt.tools.functions.schemas.feature_engineering import *
|
||||
from metagpt.tools.functions.libs.base import MLProcess
|
||||
|
||||
|
||||
@registry.register("feature_engineering", PolynomialExpansion)
|
||||
def polynomial_expansion(df, cols, degree=2):
|
||||
for col in cols:
|
||||
if not is_numeric_dtype(df[col]):
|
||||
raise ValueError(f"Column '{col}' must be numeric.")
|
||||
class PolynomialExpansion(MLProcess):
|
||||
def __init__(self, cols: list, degree: int = 2):
|
||||
self.cols = cols
|
||||
self.degree = degree
|
||||
self.poly = PolynomialFeatures(degree=degree, include_bias=False)
|
||||
|
||||
poly = PolynomialFeatures(degree=degree, include_bias=False)
|
||||
ts_data = poly.fit_transform(df[cols].fillna(0))
|
||||
new_columns = poly.get_feature_names_out(cols)
|
||||
ts_data = pd.DataFrame(ts_data, columns=new_columns, index=df.index)
|
||||
ts_data = ts_data.drop(cols, axis=1)
|
||||
df = pd.concat([df, ts_data], axis=1)
|
||||
return df
|
||||
def fit(self, df: pd.DataFrame):
|
||||
self.poly.fit(df[self.cols].fillna(0))
|
||||
|
||||
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
ts_data = self.poly.transform(df[self.cols].fillna(0))
|
||||
column_name = self.poly.get_feature_names_out(self.cols)
|
||||
ts_data = pd.DataFrame(ts_data, index=df.index, columns=column_name)
|
||||
df.drop(self.cols, axis=1, inplace=True)
|
||||
df = pd.concat([df, ts_data], axis=1)
|
||||
return df
|
||||
|
||||
|
||||
@registry.register("feature_engineering", OneHotEncoding)
|
||||
def one_hot_encoding(df, cols):
|
||||
enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
|
||||
ts_data = enc.fit_transform(df[cols])
|
||||
new_columns = enc.get_feature_names_out(cols)
|
||||
ts_data = pd.DataFrame(ts_data, columns=new_columns, index=df.index)
|
||||
df.drop(cols, axis=1, inplace=True)
|
||||
df = pd.concat([df, ts_data], axis=1)
|
||||
return df
|
||||
class CatCount(MLProcess):
|
||||
def __init__(self, col: str):
|
||||
self.col = col
|
||||
self.encoder_dict = None
|
||||
|
||||
def fit(self, df: pd.DataFrame):
|
||||
self.encoder_dict = df[self.col].value_counts().to_dict()
|
||||
|
||||
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
df[f"{self.col}_cnt"] = df[self.col].map(self.encoder_dict)
|
||||
return df
|
||||
|
||||
|
||||
@registry.register("feature_engineering", FrequencyEncoding)
|
||||
def frequency_encoding(df, cols):
|
||||
for col in cols:
|
||||
encoder_dict = df[col].value_counts().to_dict()
|
||||
df[f"{col}_cnt"] = df[col].map(encoder_dict)
|
||||
return df
|
||||
class TargetMeanEncoder(MLProcess):
|
||||
def __init__(self, col: str, label: str):
|
||||
self.col = col
|
||||
self.label = label
|
||||
self.encoder_dict = None
|
||||
|
||||
def fit(self, df: pd.DataFrame):
|
||||
self.encoder_dict = df.groupby(self.col)[self.label].mean().to_dict()
|
||||
|
||||
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
df[f"{self.col}_target_mean"] = df[self.col].map(self.encoder_dict)
|
||||
return df
|
||||
|
||||
|
||||
@registry.register("feature_engineering", CatCross)
|
||||
def cat_cross(df, cols, max_cat_num=100):
|
||||
for col in cols:
|
||||
if df[col].nunique() > max_cat_num:
|
||||
cols.remove(col)
|
||||
class KFoldTargetMeanEncoder(MLProcess):
|
||||
def __init__(self, col: str, label: str, n_splits: int = 5, random_state: int = 2021):
|
||||
self.col = col
|
||||
self.label = label
|
||||
self.n_splits = n_splits
|
||||
self.random_state = random_state
|
||||
self.encoder_dict = None
|
||||
|
||||
for col1, col2 in itertools.combinations(cols, 2):
|
||||
cross_col = f"{col1}_cross_{col2}"
|
||||
df[cross_col] = df[col1].astype(str) + "_" + df[col2].astype(str)
|
||||
return df
|
||||
def fit(self, df: pd.DataFrame):
|
||||
tmp = df.copy()
|
||||
kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state)
|
||||
|
||||
|
||||
@registry.register("feature_engineering", GroupStat)
|
||||
def group_stat(df, group_col, agg_col, agg_funcs):
|
||||
group_df = df.groupby(group_col)[agg_col].agg(agg_funcs).reset_index()
|
||||
group_df.columns = group_col + [
|
||||
f"{agg_col}_{agg_func}_by_{group_col}" for agg_func in agg_funcs
|
||||
]
|
||||
df = df.merge(group_df, on=group_col, how="left")
|
||||
return df
|
||||
|
||||
|
||||
@registry.register("feature_engineering", ExtractTimeComps)
|
||||
def extract_time_comps(df, time_col, time_comps):
|
||||
time_s = pd.to_datetime(df[time_col], errors="coerce")
|
||||
time_comps_df = pd.DataFrame()
|
||||
|
||||
if "year" in time_comps:
|
||||
time_comps_df["year"] = time_s.dt.year
|
||||
if "month" in time_comps:
|
||||
time_comps_df["month"] = time_s.dt.month
|
||||
if "day" in time_comps:
|
||||
time_comps_df["day"] = time_s.dt.day
|
||||
if "hour" in time_comps:
|
||||
time_comps_df["hour"] = time_s.dt.hour
|
||||
if "dayofweek" in time_comps:
|
||||
time_comps_df["dayofweek"] = time_s.dt.dayofweek + 1
|
||||
if "is_weekend" in time_comps:
|
||||
time_comps_df["is_weekend"] = time_s.dt.dayofweek.isin([5, 6]).astype(int)
|
||||
df = pd.concat([df, time_comps_df], axis=1)
|
||||
return df
|
||||
|
||||
|
||||
@registry.register("feature_engineering", FeShiftByTime)
|
||||
def fe_shift_by_time(df, time_col, group_col, shift_col, periods, freq):
|
||||
df[time_col] = pd.to_datetime(df[time_col])
|
||||
|
||||
def shift_datetime(date, offset, unit):
|
||||
if unit in ["year", "y", "Y"]:
|
||||
return date + relativedelta(years=offset)
|
||||
elif unit in ["month", "m", "M"]:
|
||||
return date + relativedelta(months=offset)
|
||||
elif unit in ["day", "d", "D"]:
|
||||
return date + relativedelta(days=offset)
|
||||
elif unit in ["week", "w", "W"]:
|
||||
return date + relativedelta(weeks=offset)
|
||||
elif unit in ["hour", "h", "H"]:
|
||||
return date + relativedelta(hours=offset)
|
||||
else:
|
||||
return date
|
||||
|
||||
def shift_by_time_on_key(
|
||||
inner_df, time_col, group_col, shift_col, offset, unit, col_name
|
||||
):
|
||||
inner_df = inner_df.drop_duplicates()
|
||||
inner_df[time_col] = inner_df[time_col].map(
|
||||
lambda x: shift_datetime(x, offset, unit)
|
||||
)
|
||||
inner_df = inner_df.groupby([time_col, group_col], as_index=False)[
|
||||
shift_col
|
||||
].mean()
|
||||
inner_df.rename(columns={shift_col: col_name}, inplace=True)
|
||||
return inner_df
|
||||
|
||||
shift_df = df[[time_col, group_col, shift_col]].copy()
|
||||
for period in periods:
|
||||
new_col_name = f"{group_col}_{shift_col}_lag_{period}_{freq}"
|
||||
tmp = shift_by_time_on_key(
|
||||
shift_df, time_col, group_col, shift_col, period, freq, new_col_name
|
||||
)
|
||||
df = df.merge(tmp, on=[time_col, group_col], how="left")
|
||||
|
||||
return df
|
||||
|
||||
|
||||
@registry.register("feature_engineering", FeRollingByTime)
|
||||
def fe_rolling_by_time(df, time_col, group_col, rolling_col, periods, freq, agg_funcs):
|
||||
df[time_col] = pd.to_datetime(df[time_col])
|
||||
|
||||
def rolling_by_time_on_key(inner_df, offset, unit, agg_func, col_name):
|
||||
time_freq = {
|
||||
"Y": [365 * offset, "D"],
|
||||
"M": [30 * offset, "D"],
|
||||
"D": [offset, "D"],
|
||||
"W": [7 * offset, "D"],
|
||||
"H": [offset, "h"],
|
||||
}
|
||||
|
||||
if agg_func not in ["mean", "std", "max", "min", "median", "sum", "count"]:
|
||||
raise ValueError(f"Invalid agg function: {agg_func}")
|
||||
|
||||
rolling_feat = inner_df.rolling(
|
||||
f"{time_freq[unit][0]}{time_freq[unit][1]}", closed="left"
|
||||
)
|
||||
rolling_feat = getattr(rolling_feat, agg_func)()
|
||||
depth = df.columns.nlevels
|
||||
rolling_feat = rolling_feat.stack(list(range(depth)))
|
||||
rolling_feat.name = col_name
|
||||
return rolling_feat
|
||||
|
||||
rolling_df = df[[time_col, group_col, rolling_col]].copy()
|
||||
for period in periods:
|
||||
for func in agg_funcs:
|
||||
new_col_name = f"{group_col}_{rolling_col}_rolling_{period}_{freq}_{func}"
|
||||
tmp = pd.pivot_table(
|
||||
rolling_df,
|
||||
index=time_col,
|
||||
values=rolling_col,
|
||||
columns=group_col,
|
||||
global_mean = tmp[self.label].mean()
|
||||
col_name = f"{self.col}_kf_target_mean"
|
||||
for trn_idx, val_idx in kf.split(tmp, tmp[self.label]):
|
||||
_trn, _val = tmp.iloc[trn_idx], tmp.iloc[val_idx]
|
||||
tmp.loc[tmp.index[val_idx], col_name] = _val[self.col].map(
|
||||
_trn.groupby(self.col)[self.label].mean()
|
||||
)
|
||||
tmp = rolling_by_time_on_key(tmp, period, freq, func, new_col_name)
|
||||
df = df.merge(tmp, on=[time_col, group_col], how="left")
|
||||
tmp[col_name].fillna(global_mean, inplace=True)
|
||||
self.encoder_dict = tmp.groupby(self.col)[col_name].mean().to_dict()
|
||||
|
||||
return df
|
||||
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
df[f"{self.col}_kf_target_mean"] = df[self.col].map(self.encoder_dict)
|
||||
return df
|
||||
|
||||
|
||||
class CatCross(MLProcess):
|
||||
def __init__(self, cols: list, max_cat_num: int = 100):
|
||||
self.cols = cols
|
||||
self.max_cat_num = max_cat_num
|
||||
self.combs = []
|
||||
self.combs_map = {}
|
||||
|
||||
@staticmethod
|
||||
def cross_two(comb, df):
|
||||
new_col = f'{comb[0]}_{comb[1]}'
|
||||
new_col_combs = list(itertools.product(df[comb[0]].unique(), df[comb[1]].unique()))
|
||||
ll = list(range(len(new_col_combs)))
|
||||
comb_map = dict(zip(new_col_combs, ll))
|
||||
return new_col, comb_map
|
||||
|
||||
def fit(self, df: pd.DataFrame):
|
||||
for col in self.cols:
|
||||
if df[col].nunique() > self.max_cat_num:
|
||||
self.cols.remove(col)
|
||||
self.combs = list(itertools.combinations(self.cols, 2))
|
||||
res = Parallel(n_jobs=4, require='sharedmem')(
|
||||
delayed(self.cross_two)(comb, df) for comb in self.combs)
|
||||
self.combs_map = dict(res)
|
||||
|
||||
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
for comb in self.combs:
|
||||
new_col = f'{comb[0]}_{comb[1]}'
|
||||
_map = self.combs_map[new_col]
|
||||
df[new_col] = pd.Series(zip(df[comb[0]], df[comb[1]])).map(_map)
|
||||
# set the unknown value to a new number
|
||||
df[new_col].fillna(max(_map.values()) + 1, inplace=True)
|
||||
df[new_col] = df[new_col].astype(int)
|
||||
return df
|
||||
|
||||
|
||||
class GroupStat(MLProcess):
|
||||
def __init__(self, group_col: str, agg_col: str, agg_funcs: list):
|
||||
self.group_col = group_col
|
||||
self.agg_col = agg_col
|
||||
self.agg_funcs = agg_funcs
|
||||
self.group_df = None
|
||||
|
||||
def fit(self, df: pd.DataFrame):
|
||||
group_df = df.groupby(self.group_col)[self.agg_col].agg(self.agg_funcs).reset_index()
|
||||
group_df.columns = [self.group_col] + [
|
||||
f"{self.agg_col}_{agg_func}_by_{self.group_col}" for agg_func in self.agg_funcs
|
||||
]
|
||||
self.group_df = group_df
|
||||
|
||||
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
df = df.merge(self.group_df, on=self.group_col, how="left")
|
||||
return df
|
||||
|
||||
|
||||
class SplitBins(MLProcess):
|
||||
def __init__(self, cols: str, strategy: str = 'quantile'):
|
||||
self.cols = cols
|
||||
self.strategy = strategy
|
||||
self.encoder = None
|
||||
|
||||
def fit(self, df: pd.DataFrame):
|
||||
self.encoder = KBinsDiscretizer(strategy=self.strategy, encode='ordinal')
|
||||
self.encoder.fit(df[self.cols].fillna(0))
|
||||
|
||||
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
df[self.cols] = self.encoder.transform(df[self.cols].fillna(0))
|
||||
return df
|
||||
|
||||
# @registry.register("feature_engineering", ExtractTimeComps)
|
||||
# def extract_time_comps(df, time_col, time_comps):
|
||||
# time_s = pd.to_datetime(df[time_col], errors="coerce")
|
||||
# time_comps_df = pd.DataFrame()
|
||||
#
|
||||
# if "year" in time_comps:
|
||||
# time_comps_df["year"] = time_s.dt.year
|
||||
# if "month" in time_comps:
|
||||
# time_comps_df["month"] = time_s.dt.month
|
||||
# if "day" in time_comps:
|
||||
# time_comps_df["day"] = time_s.dt.day
|
||||
# if "hour" in time_comps:
|
||||
# time_comps_df["hour"] = time_s.dt.hour
|
||||
# if "dayofweek" in time_comps:
|
||||
# time_comps_df["dayofweek"] = time_s.dt.dayofweek + 1
|
||||
# if "is_weekend" in time_comps:
|
||||
# time_comps_df["is_weekend"] = time_s.dt.dayofweek.isin([5, 6]).astype(int)
|
||||
# df = pd.concat([df, time_comps_df], axis=1)
|
||||
# return df
|
||||
#
|
||||
#
|
||||
# @registry.register("feature_engineering", FeShiftByTime)
|
||||
# def fe_shift_by_time(df, time_col, group_col, shift_col, periods, freq):
|
||||
# df[time_col] = pd.to_datetime(df[time_col])
|
||||
#
|
||||
# def shift_datetime(date, offset, unit):
|
||||
# if unit in ["year", "y", "Y"]:
|
||||
# return date + relativedelta(years=offset)
|
||||
# elif unit in ["month", "m", "M"]:
|
||||
# return date + relativedelta(months=offset)
|
||||
# elif unit in ["day", "d", "D"]:
|
||||
# return date + relativedelta(days=offset)
|
||||
# elif unit in ["week", "w", "W"]:
|
||||
# return date + relativedelta(weeks=offset)
|
||||
# elif unit in ["hour", "h", "H"]:
|
||||
# return date + relativedelta(hours=offset)
|
||||
# else:
|
||||
# return date
|
||||
#
|
||||
# def shift_by_time_on_key(
|
||||
# inner_df, time_col, group_col, shift_col, offset, unit, col_name
|
||||
# ):
|
||||
# inner_df = inner_df.drop_duplicates()
|
||||
# inner_df[time_col] = inner_df[time_col].map(
|
||||
# lambda x: shift_datetime(x, offset, unit)
|
||||
# )
|
||||
# inner_df = inner_df.groupby([time_col, group_col], as_index=False)[
|
||||
# shift_col
|
||||
# ].mean()
|
||||
# inner_df.rename(columns={shift_col: col_name}, inplace=True)
|
||||
# return inner_df
|
||||
#
|
||||
# shift_df = df[[time_col, group_col, shift_col]].copy()
|
||||
# for period in periods:
|
||||
# new_col_name = f"{group_col}_{shift_col}_lag_{period}_{freq}"
|
||||
# tmp = shift_by_time_on_key(
|
||||
# shift_df, time_col, group_col, shift_col, period, freq, new_col_name
|
||||
# )
|
||||
# df = df.merge(tmp, on=[time_col, group_col], how="left")
|
||||
#
|
||||
# return df
|
||||
#
|
||||
#
|
||||
# @registry.register("feature_engineering", FeRollingByTime)
|
||||
# def fe_rolling_by_time(df, time_col, group_col, rolling_col, periods, freq, agg_funcs):
|
||||
# df[time_col] = pd.to_datetime(df[time_col])
|
||||
#
|
||||
# def rolling_by_time_on_key(inner_df, offset, unit, agg_func, col_name):
|
||||
# time_freq = {
|
||||
# "Y": [365 * offset, "D"],
|
||||
# "M": [30 * offset, "D"],
|
||||
# "D": [offset, "D"],
|
||||
# "W": [7 * offset, "D"],
|
||||
# "H": [offset, "h"],
|
||||
# }
|
||||
#
|
||||
# if agg_func not in ["mean", "std", "max", "min", "median", "sum", "count"]:
|
||||
# raise ValueError(f"Invalid agg function: {agg_func}")
|
||||
#
|
||||
# rolling_feat = inner_df.rolling(
|
||||
# f"{time_freq[unit][0]}{time_freq[unit][1]}", closed="left"
|
||||
# )
|
||||
# rolling_feat = getattr(rolling_feat, agg_func)()
|
||||
# depth = df.columns.nlevels
|
||||
# rolling_feat = rolling_feat.stack(list(range(depth)))
|
||||
# rolling_feat.name = col_name
|
||||
# return rolling_feat
|
||||
#
|
||||
# rolling_df = df[[time_col, group_col, rolling_col]].copy()
|
||||
# for period in periods:
|
||||
# for func in agg_funcs:
|
||||
# new_col_name = f"{group_col}_{rolling_col}_rolling_{period}_{freq}_{func}"
|
||||
# tmp = pd.pivot_table(
|
||||
# rolling_df,
|
||||
# index=time_col,
|
||||
# values=rolling_col,
|
||||
# columns=group_col,
|
||||
# )
|
||||
# tmp = rolling_by_time_on_key(tmp, period, freq, func, new_col_name)
|
||||
# df = df.merge(tmp, on=[time_col, group_col], how="left")
|
||||
#
|
||||
# return df
|
||||
|
||||
|
||||
class GeneralSelection(MLProcess):
|
||||
def __init__(self, label_col: str):
|
||||
self.label_col = label_col
|
||||
self.feats = []
|
||||
|
||||
def fit(self, df: pd.DataFrame):
|
||||
feats = [f for f in df.columns if f != self.label_col]
|
||||
for col in df.columns:
|
||||
if df[col].isnull().sum() / df.shape[0] == 1:
|
||||
feats.remove(col)
|
||||
|
||||
if df[col].nunique() == 1:
|
||||
feats.remove(col)
|
||||
|
||||
if (
|
||||
df.loc[df[col] == np.inf].shape[0] != 0
|
||||
or df.loc[df[col] == np.inf].shape[0] != 0
|
||||
):
|
||||
feats.remove(col)
|
||||
|
||||
if is_object_dtype(df[col]) and df[col].nunique() == df.shape[0]:
|
||||
feats.remove(col)
|
||||
|
||||
self.feats = feats
|
||||
|
||||
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
df = df[self.feats + [self.label_col]]
|
||||
return df
|
||||
|
|
|
|||
|
|
@ -1,196 +0,0 @@
|
|||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
from sklearn.ensemble import GradientBoostingClassifier
|
||||
|
||||
|
||||
from sklearn.linear_model import LinearRegression
|
||||
from sklearn.ensemble import RandomForestRegressor
|
||||
from sklearn.ensemble import GradientBoostingRegressor
|
||||
|
||||
from metagpt.tools.functions import registry
|
||||
from metagpt.tools.functions.schemas.ml_model import *
|
||||
|
||||
|
||||
#########
|
||||
## 分类 ##
|
||||
#########
|
||||
|
||||
|
||||
@registry.register("classification_model", LogisticRegressionClassification)
|
||||
def logistic_regression_classification(df, label, test_size=0.2, penalty='l2', dual=False):
|
||||
nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
|
||||
for col in nonnumeric_columns:
|
||||
df[col] = LabelEncoder().fit_transform(df[col])
|
||||
df = df.fillna(0)
|
||||
|
||||
features = [col for col in df if col != label]
|
||||
x, y = df[features], df[label]
|
||||
tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1)
|
||||
|
||||
model = LogisticRegression(penalty=penalty, dual=dual)
|
||||
model.fit(tr_x, tr_y, )
|
||||
te_pred_prob = model.predict_proba(te_x)
|
||||
|
||||
res = {
|
||||
'te_pred_prob': te_pred_prob
|
||||
}
|
||||
return res
|
||||
|
||||
|
||||
@registry.register("classification_model", RandomForestClassification)
|
||||
def random_forest_classification(df, label, test_size=0.2, n_estimators=100, criterion='gini'):
|
||||
nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
|
||||
for col in nonnumeric_columns:
|
||||
df[col] = LabelEncoder().fit_transform(df[col])
|
||||
df = df.fillna(0)
|
||||
|
||||
features = [col for col in df if col != label]
|
||||
x, y = df[features], df[label]
|
||||
tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1)
|
||||
model = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion)
|
||||
model.fit(tr_x, tr_y, )
|
||||
te_pred_prob = model.predict_proba(te_x)
|
||||
|
||||
res = {
|
||||
'te_pred_prob': te_pred_prob
|
||||
}
|
||||
return res
|
||||
|
||||
|
||||
@registry.register("classification_model", GradientBoostingClassification)
|
||||
def gradient_boosting_classification(df, label, test_size=0.2, n_estimators=100, learning_rate=0.1):
|
||||
nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
|
||||
for col in nonnumeric_columns:
|
||||
df[col] = LabelEncoder().fit_transform(df[col])
|
||||
df = df.fillna(0)
|
||||
|
||||
features = [col for col in df if col != label]
|
||||
x, y = df[features], df[label]
|
||||
tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1)
|
||||
model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate)
|
||||
model.fit(tr_x, tr_y, )
|
||||
te_pred_prob = model.predict_proba(te_x)
|
||||
|
||||
res = {
|
||||
'te_pred_prob': te_pred_prob
|
||||
}
|
||||
return res
|
||||
|
||||
|
||||
|
||||
#########
|
||||
## 回归 ##
|
||||
#########
|
||||
|
||||
|
||||
@registry.register("regression_model", LinearRegressionRegression)
|
||||
def linear_regression(df, label, test_size=0.2, ):
|
||||
nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
|
||||
for col in nonnumeric_columns:
|
||||
df[col] = LabelEncoder().fit_transform(df[col])
|
||||
df = df.fillna(0)
|
||||
|
||||
features = [col for col in df if col != label]
|
||||
x, y = df[features], df[label]
|
||||
tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1)
|
||||
|
||||
model = LinearRegression()
|
||||
model.fit(tr_x, tr_y, )
|
||||
te_pred_prob = model.predict(te_x)
|
||||
|
||||
res = {
|
||||
'te_pred_prob': te_pred_prob
|
||||
}
|
||||
return res
|
||||
|
||||
|
||||
@registry.register("regression_model", RandomForestRegression)
|
||||
def random_forest_regression(df, label, test_size=0.2, n_estimators=100, criterion='squared_error'):
|
||||
nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
|
||||
for col in nonnumeric_columns:
|
||||
df[col] = LabelEncoder().fit_transform(df[col])
|
||||
df = df.fillna(0)
|
||||
|
||||
features = [col for col in df if col != label]
|
||||
x, y = df[features], df[label]
|
||||
tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1)
|
||||
model = RandomForestRegressor(n_estimators=n_estimators, criterion=criterion)
|
||||
model.fit(tr_x, tr_y, )
|
||||
te_pred_prob = model.predict(te_x)
|
||||
|
||||
res = {
|
||||
'te_pred_prob': te_pred_prob
|
||||
}
|
||||
return res
|
||||
|
||||
|
||||
@registry.register("regression_model", GradientBoostingRegression)
|
||||
def gradient_boosting_regression(df, label, test_size=0.2, n_estimators=100, learning_rate=0.1):
|
||||
nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
|
||||
for col in nonnumeric_columns:
|
||||
df[col] = LabelEncoder().fit_transform(df[col])
|
||||
df = df.fillna(0)
|
||||
|
||||
features = [col for col in df if col != label]
|
||||
x, y = df[features], df[label]
|
||||
tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1)
|
||||
model = GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=learning_rate)
|
||||
model.fit(tr_x, tr_y, )
|
||||
te_pred_prob = model.predict(te_x)
|
||||
|
||||
res = {
|
||||
'te_pred_prob': te_pred_prob
|
||||
}
|
||||
return res
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
def run():
|
||||
from sklearn.datasets import load_iris
|
||||
loader = load_iris(as_frame=True)
|
||||
df = loader['data']
|
||||
df['target'] = loader['target']
|
||||
|
||||
df[df.columns[0]] = df[df.columns[0]].astype(str)
|
||||
df[df.columns[1]] = df[df.columns[1]].astype(int)
|
||||
df['target'] = df['target'].astype(str)
|
||||
|
||||
print(df)
|
||||
print('####'*5)
|
||||
res = logistic_regression_classification(df, 'target', test_size=0.25, penalty='l2', dual=False)
|
||||
print(res['te_pred_prob'])
|
||||
|
||||
print('####'*5)
|
||||
res = random_forest_classification(df, 'target', test_size=0.25, n_estimators=100, criterion='gini')
|
||||
print(res['te_pred_prob'])
|
||||
|
||||
print('####'*5)
|
||||
res = gradient_boosting_classification(df, 'target', test_size=0.25, n_estimators=100, learning_rate=0.1)
|
||||
print(res['te_pred_prob'])
|
||||
|
||||
from sklearn.datasets import make_regression
|
||||
import pandas as pd
|
||||
loader = make_regression()
|
||||
df = pd.DataFrame(loader[0])
|
||||
df['target'] = loader[1]
|
||||
|
||||
df[df.columns[0]] = df[df.columns[0]].astype(str)
|
||||
df[df.columns[1]] = df[df.columns[1]].astype(int)
|
||||
# df['target'] = df['target'].astype(str)
|
||||
|
||||
print(df)
|
||||
print('####' * 5)
|
||||
res = linear_regression(df, 'target', test_size=0.25, )
|
||||
print(res['te_pred_prob'])
|
||||
|
||||
print('####' * 5)
|
||||
res = random_forest_regression(df, 'target', test_size=0.25, n_estimators=100, criterion='squared_error')
|
||||
print(res['te_pred_prob'])
|
||||
|
||||
print('####' * 5)
|
||||
res = gradient_boosting_regression(df, 'target', test_size=0.25, n_estimators=100, learning_rate=0.1)
|
||||
print(res['te_pred_prob'])
|
||||
run()
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2023/11/16 16:37
|
||||
# @Author : lidanyang
|
||||
# @File : __init__.py
|
||||
# @Desc :
|
||||
|
|
@ -1,78 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2023/11/16 16:38
|
||||
# @Author : lidanyang
|
||||
# @File : register.py
|
||||
# @Desc :
|
||||
import inspect
|
||||
from typing import Type, Optional, Callable, Dict, Union, List
|
||||
|
||||
from metagpt.tools.functions.schemas.base import ToolSchema
|
||||
|
||||
|
||||
class FunctionRegistry:
|
||||
def __init__(self):
|
||||
self.functions: Dict[str, Dict[str, Dict]] = {}
|
||||
|
||||
@staticmethod
|
||||
def _check_param_consistency(func_params, schema):
|
||||
param_names = set(func_params.keys())
|
||||
schema_names = set(schema["parameters"]["properties"].keys())
|
||||
|
||||
if param_names != schema_names:
|
||||
raise ValueError("Function parameters do not match schema properties")
|
||||
|
||||
def register(self, module: str, tool_schema: Type[ToolSchema]) -> Callable:
|
||||
def wrapper(func: Callable) -> Callable:
|
||||
module_registry = self.functions.setdefault(module, {})
|
||||
|
||||
if func.__name__ in module_registry:
|
||||
raise ValueError(f"Function {func.__name__} is already registered in {module}")
|
||||
|
||||
func_params = inspect.signature(func).parameters
|
||||
|
||||
schema = tool_schema.schema()
|
||||
schema["name"] = func.__name__
|
||||
|
||||
self._check_param_consistency(func_params, schema)
|
||||
|
||||
module_registry[func.__name__] = {
|
||||
"func": func,
|
||||
"schema": schema,
|
||||
}
|
||||
return func
|
||||
|
||||
return wrapper
|
||||
|
||||
def get(self, module: str, name: str) -> Optional[Union[Callable, Dict]]:
|
||||
"""Get function by module and name"""
|
||||
module_registry = self.functions.get(module, {})
|
||||
return module_registry.get(name)
|
||||
|
||||
def get_by_name(self, name: str) -> Optional[Dict]:
|
||||
"""Get function by name"""
|
||||
for module_registry in self.functions.values():
|
||||
if name in module_registry:
|
||||
return module_registry.get(name, {})
|
||||
|
||||
def get_all_by_module(self, module: str) -> Optional[Dict]:
|
||||
"""Get all functions by module"""
|
||||
return self.functions.get(module, {})
|
||||
|
||||
def get_schema(self, module: str, name: str) -> Optional[Dict]:
|
||||
"""Get schema by module and name"""
|
||||
module_registry = self.functions.get(module, {})
|
||||
return module_registry.get(name, {}).get("schema")
|
||||
|
||||
def get_schemas(self, module: str, names: List[str]) -> List[Dict]:
|
||||
"""Get schemas by module and names"""
|
||||
module_registry = self.functions.get(module, {})
|
||||
return [module_registry.get(name, {}).get("schema") for name in names]
|
||||
|
||||
def get_all_schema_by_module(self, module: str) -> List[Dict]:
|
||||
"""Get all schemas by module"""
|
||||
module_registry = self.functions.get(module, {})
|
||||
return [v.get("schema") for v in module_registry.values()]
|
||||
|
||||
|
||||
registry = FunctionRegistry()
|
||||
|
|
@ -1,100 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2023/11/16 16:34
|
||||
# @Author : lidanyang
|
||||
# @File : base.py
|
||||
# @Desc : Build base class to generate schema for tool
|
||||
from typing import Any, List, Optional, get_type_hints
|
||||
|
||||
|
||||
class NoDefault:
|
||||
"""
|
||||
A class to represent a missing default value.
|
||||
|
||||
This is used to distinguish between a default value of None and a missing default value.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
def tool_field(
|
||||
description: str, default: Any = NoDefault(), enum: Optional[List[Any]] = None, **kwargs
|
||||
):
|
||||
"""
|
||||
Create a field for a tool parameter.
|
||||
|
||||
Args:
|
||||
description (str): A description of the field.
|
||||
default (Any, optional): The default value for the field. Defaults to None.
|
||||
enum (Optional[List[Any]], optional): A list of possible values for the field. Defaults to None.
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
dict: A dictionary representing the field with provided attributes.
|
||||
"""
|
||||
field_info = {
|
||||
"description": description,
|
||||
"default": default,
|
||||
"enum": enum,
|
||||
}
|
||||
field_info.update(kwargs)
|
||||
return field_info
|
||||
|
||||
|
||||
class ToolSchema:
|
||||
@staticmethod
|
||||
def format_type(type_hint):
|
||||
"""
|
||||
Format a type hint into a string representation.
|
||||
|
||||
Args:
|
||||
type_hint (type): The type hint to format.
|
||||
|
||||
Returns:
|
||||
str: A string representation of the type hint.
|
||||
"""
|
||||
if isinstance(type_hint, type):
|
||||
# Handle built-in types separately
|
||||
if type_hint.__module__ == "builtins":
|
||||
return type_hint.__name__
|
||||
else:
|
||||
return f"{type_hint.__module__}.{type_hint.__name__}"
|
||||
elif hasattr(type_hint, "__origin__") and hasattr(type_hint, "__args__"):
|
||||
# Handle generic types (like List[int])
|
||||
origin_type = ToolSchema.format_type(type_hint.__origin__)
|
||||
args_type = ", ".join(
|
||||
[ToolSchema.format_type(t) for t in type_hint.__args__]
|
||||
)
|
||||
return f"{origin_type}[{args_type}]"
|
||||
else:
|
||||
return str(type_hint)
|
||||
|
||||
@classmethod
|
||||
def schema(cls):
|
||||
"""
|
||||
Generate a schema dictionary for the class.
|
||||
|
||||
The schema includes the class name, description, and information about
|
||||
each class parameter based on type hints and field definitions.
|
||||
|
||||
Returns:
|
||||
dict: A dictionary representing the schema of the class.
|
||||
"""
|
||||
schema = {
|
||||
"name": cls.__name__,
|
||||
"description": cls.__doc__,
|
||||
"parameters": {"type": "object", "properties": {}, "required": []},
|
||||
}
|
||||
type_hints = get_type_hints(cls)
|
||||
for attr, type_hint in type_hints.items():
|
||||
value = getattr(cls, attr, None)
|
||||
if isinstance(value, dict):
|
||||
# Process each attribute that is defined using the field function
|
||||
prop_info = {k: v for k, v in value.items() if v is not None or k == "default"}
|
||||
if isinstance(prop_info["default"], NoDefault):
|
||||
del prop_info["default"]
|
||||
prop_info["type"] = ToolSchema.format_type(type_hint)
|
||||
schema["parameters"]["properties"][attr] = prop_info
|
||||
# Check for required fields
|
||||
if "default" not in prop_info:
|
||||
schema["parameters"]["required"].append(attr)
|
||||
return schema
|
||||
|
|
@ -1,62 +0,0 @@
|
|||
|
||||
import pandas as pd
|
||||
|
||||
from metagpt.tools.functions.schemas.base import tool_field, ToolSchema
|
||||
|
||||
|
||||
class FillMissingValue(ToolSchema):
|
||||
"""Completing missing values with simple strategies"""
|
||||
df: pd.DataFrame = tool_field(description="input dataframe")
|
||||
features: list = tool_field(description="columns to be processed")
|
||||
strategy: str = tool_field(description="the imputation strategy", default='mean')
|
||||
fill_value: int = tool_field(description="fill_value is used to replace all occurrences of missing_values", default=None)
|
||||
|
||||
|
||||
# class LabelEncode(ToolSchema):
|
||||
# """Completing missing values with simple strategies"""
|
||||
# df: pd.DataFrame = tool_field(description="input dataframe")
|
||||
# features: list = tool_field(description="columns to be processed")
|
||||
|
||||
|
||||
class SplitBins(ToolSchema):
|
||||
"""Bin continuous data into intervals and return the bin identifier encoded as an integer value"""
|
||||
df: pd.DataFrame = tool_field(description="input dataframe")
|
||||
features: list = tool_field(description="columns to be processed")
|
||||
strategy: str = tool_field(description="Strategy used to define the widths of the bins", default='quantile')
|
||||
|
||||
|
||||
class MinMaxScale(ToolSchema):
|
||||
"""Transform features by scaling each feature to a range, witch is (0, 1)"""
|
||||
df: pd.DataFrame = tool_field(description="input dataframe")
|
||||
features: list = tool_field(description="columns to be processed")
|
||||
|
||||
|
||||
class StandardScale(ToolSchema):
|
||||
"""Standardize features by removing the mean and scaling to unit variance"""
|
||||
df: pd.DataFrame = tool_field(description="input dataframe")
|
||||
features: list = tool_field(description="columns to be processed")
|
||||
|
||||
|
||||
class LogTransform(ToolSchema):
|
||||
"""Performs a logarithmic transformation on the specified columns"""
|
||||
df: pd.DataFrame = tool_field(description="input dataframe")
|
||||
features: list = tool_field(description="columns to be processed")
|
||||
|
||||
|
||||
class MaxAbsScale(ToolSchema):
|
||||
"""Scale each feature by its maximum absolute value"""
|
||||
df: pd.DataFrame = tool_field(description="input dataframe")
|
||||
features: list = tool_field(description="columns to be processed")
|
||||
|
||||
|
||||
class RobustScale(ToolSchema):
|
||||
"""Scale features using statistics that are robust to outliers, the quantile_range is (25.0, 75.0)"""
|
||||
df: pd.DataFrame = tool_field(description="input dataframe")
|
||||
features: list = tool_field(description="columns to be processed")
|
||||
|
||||
|
||||
class OrdinalEncode(ToolSchema):
|
||||
"""Encode categorical features as an integer array"""
|
||||
df: pd.DataFrame = tool_field(description="input dataframe")
|
||||
features: list = tool_field(description="columns to be processed")
|
||||
|
||||
306
metagpt/tools/functions/schemas/data_preprocess.yml
Normal file
306
metagpt/tools/functions/schemas/data_preprocess.yml
Normal file
|
|
@ -0,0 +1,306 @@
|
|||
FillMissingValue:
|
||||
type: class
|
||||
description: "Completing missing values with simple strategies"
|
||||
methods:
|
||||
__init__:
|
||||
description: "Initialize self."
|
||||
parameters:
|
||||
properties:
|
||||
features:
|
||||
type: list
|
||||
description: "columns to be processed"
|
||||
strategy:
|
||||
type: str
|
||||
description: "the imputation strategy"
|
||||
default: mean
|
||||
enum:
|
||||
- mean
|
||||
- median
|
||||
- most_frequent
|
||||
- constant
|
||||
fill_value:
|
||||
type: int
|
||||
description: "fill_value is used to replace all occurrences of missing_values"
|
||||
default: null
|
||||
required:
|
||||
- features
|
||||
fit:
|
||||
description: "Fit the FillMissingValue model."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
transform:
|
||||
description: "Transform the input DataFrame with the fitted model."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
returns:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The transformed DataFrame."
|
||||
fit_transform:
|
||||
description: "Fit and transform the input DataFrame."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
returns:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The transformed DataFrame."
|
||||
|
||||
MinMaxScale:
|
||||
type: class
|
||||
description: "Transform features by scaling each feature to a range, witch is (0, 1)"
|
||||
methods:
|
||||
__init__:
|
||||
description: "Initialize self."
|
||||
parameters:
|
||||
properties:
|
||||
features:
|
||||
type: list
|
||||
description: "columns to be processed"
|
||||
required:
|
||||
- features
|
||||
fit:
|
||||
description: "Fit the MinMaxScale model."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
transform:
|
||||
description: "Transform the input DataFrame with the fitted model."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
returns:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The transformed DataFrame."
|
||||
fit_transform:
|
||||
description: "Fit and transform the input DataFrame."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
returns:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The transformed DataFrame."
|
||||
|
||||
StandardScale:
|
||||
type: class
|
||||
description: "Standardize features by removing the mean and scaling to unit variance"
|
||||
methods:
|
||||
__init__:
|
||||
description: "Initialize self."
|
||||
parameters:
|
||||
properties:
|
||||
features:
|
||||
type: list
|
||||
description: "columns to be processed"
|
||||
required:
|
||||
- features
|
||||
fit:
|
||||
description: "Fit the StandardScale model."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
transform:
|
||||
description: "Transform the input DataFrame with the fitted model."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
returns:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The transformed DataFrame."
|
||||
fit_transform:
|
||||
description: "Fit and transform the input DataFrame."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
returns:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The transformed DataFrame."
|
||||
|
||||
MaxAbsScale:
|
||||
type: class
|
||||
description: "cale each feature by its maximum absolute value"
|
||||
methods:
|
||||
__init__:
|
||||
description: "Initialize self."
|
||||
parameters:
|
||||
properties:
|
||||
features:
|
||||
type: list
|
||||
description: "columns to be processed"
|
||||
required:
|
||||
- features
|
||||
fit:
|
||||
description: "Fit the MaxAbsScale model."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
transform:
|
||||
description: "Transform the input DataFrame with the fitted model."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
returns:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The transformed DataFrame."
|
||||
fit_transform:
|
||||
description: "Fit and transform the input DataFrame."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
returns:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The transformed DataFrame."
|
||||
|
||||
LabelEncode:
|
||||
type: class
|
||||
description: "Apply label encoding to specified categorical columns in-place."
|
||||
methods:
|
||||
__init__:
|
||||
description: "Initialize self."
|
||||
parameters:
|
||||
properties:
|
||||
features:
|
||||
type: list
|
||||
description: "Categorical columns to be label encoded"
|
||||
required:
|
||||
- features
|
||||
fit:
|
||||
description: "Fit the LabelEncode model."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
transform:
|
||||
description: "Transform the input DataFrame with the fitted model."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
returns:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The transformed DataFrame."
|
||||
fit_transform:
|
||||
description: "Fit and transform the input DataFrame."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
returns:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The transformed DataFrame."
|
||||
|
||||
OneHotEncode:
|
||||
type: class
|
||||
description: "Apply one-hot encoding to specified categorical columns, the original columns will be dropped."
|
||||
methods:
|
||||
__init__:
|
||||
description: "Initialize self."
|
||||
parameters:
|
||||
properties:
|
||||
features:
|
||||
type: list
|
||||
description: "Categorical columns to be one-hot encoded and dropped"
|
||||
required:
|
||||
- features
|
||||
fit:
|
||||
description: "Fit the OneHotEncoding model."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
transform:
|
||||
description: "Transform the input DataFrame with the fitted model."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
returns:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The transformed DataFrame."
|
||||
fit_transform:
|
||||
description: "Fit and transform the input DataFrame."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
returns:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The transformed DataFrame."
|
||||
|
|
@ -1,100 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2023/11/17 10:34
|
||||
# @Author : lidanyang
|
||||
# @File : feature_engineering.py
|
||||
# @Desc : Schema for feature engineering functions
|
||||
from typing import List
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from metagpt.tools.functions.schemas.base import ToolSchema, tool_field
|
||||
|
||||
|
||||
class PolynomialExpansion(ToolSchema):
|
||||
"""Generate polynomial and interaction features from selected columns, excluding the bias column."""
|
||||
|
||||
df: pd.DataFrame = tool_field(description="DataFrame to process.")
|
||||
cols: list = tool_field(description="Columns for polynomial expansion.")
|
||||
degree: int = tool_field(description="Degree of polynomial features.", default=2)
|
||||
|
||||
|
||||
class OneHotEncoding(ToolSchema):
|
||||
"""Apply one-hot encoding to specified categorical columns in a DataFrame."""
|
||||
|
||||
df: pd.DataFrame = tool_field(description="DataFrame to process.")
|
||||
cols: list = tool_field(description="Categorical columns to be one-hot encoded.")
|
||||
|
||||
|
||||
class FrequencyEncoding(ToolSchema):
|
||||
"""Convert categorical columns to frequency encoding."""
|
||||
|
||||
df: pd.DataFrame = tool_field(description="DataFrame to process.")
|
||||
cols: list = tool_field(description="Categorical columns to be frequency encoded.")
|
||||
|
||||
|
||||
class CatCross(ToolSchema):
|
||||
"""Create pairwise crossed features from categorical columns, joining values with '_'."""
|
||||
|
||||
df: pd.DataFrame = tool_field(description="DataFrame to process.")
|
||||
cols: list = tool_field(description="Columns to be pairwise crossed.")
|
||||
max_cat_num: int = tool_field(
|
||||
description="Maximum unique categories per crossed feature.", default=100
|
||||
)
|
||||
|
||||
|
||||
class GroupStat(ToolSchema):
|
||||
"""Perform aggregation operations on a specified column grouped by certain categories."""
|
||||
|
||||
df: pd.DataFrame = tool_field(description="DataFrame to process.")
|
||||
group_col: str = tool_field(description="Column used for grouping.")
|
||||
agg_col: str = tool_field(description="Column on which aggregation is performed.")
|
||||
agg_funcs: list = tool_field(
|
||||
description="""List of aggregation functions to apply, such as ['mean', 'std'].
|
||||
Each function must be supported by pandas."""
|
||||
)
|
||||
|
||||
|
||||
class ExtractTimeComps(ToolSchema):
|
||||
"""Extract specific time components from a designated time column in a DataFrame."""
|
||||
|
||||
df: pd.DataFrame = tool_field(description="DataFrame to process.")
|
||||
time_col: str = tool_field(
|
||||
description="The name of the column containing time data."
|
||||
)
|
||||
time_comps: List[str] = tool_field(
|
||||
description="""List of time components to extract.
|
||||
Each component must be in ['year', 'month', 'day', 'hour', 'dayofweek', 'is_weekend']."""
|
||||
)
|
||||
|
||||
|
||||
class FeShiftByTime(ToolSchema):
|
||||
"""Shift column values in a DataFrame based on specified time intervals."""
|
||||
|
||||
df: pd.DataFrame = tool_field(description="DataFrame to process.")
|
||||
time_col: str = tool_field(description="Column for time-based shifting.")
|
||||
group_col: str = tool_field(description="Column for grouping before shifting.")
|
||||
shift_col: str = tool_field(description="Column to shift.")
|
||||
periods: list = tool_field(description="Time intervals for shifting.")
|
||||
freq: str = tool_field(
|
||||
description="Frequency unit for time intervals (e.g., 'D', 'M').",
|
||||
enum=["D", "M", "Y", "W", "H"],
|
||||
)
|
||||
|
||||
|
||||
class FeRollingByTime(ToolSchema):
|
||||
"""Calculate rolling statistics for a DataFrame column over time intervals."""
|
||||
|
||||
df: pd.DataFrame = tool_field(description="DataFrame to process.")
|
||||
time_col: str = tool_field(description="Column for time-based rolling.")
|
||||
group_col: str = tool_field(description="Column for grouping before rolling.")
|
||||
rolling_col: str = tool_field(description="Column for rolling calculations.")
|
||||
periods: list = tool_field(description="Window sizes for rolling.")
|
||||
freq: str = tool_field(
|
||||
description="Frequency unit for time windows (e.g., 'D', 'M').",
|
||||
enum=["D", "M", "Y", "W", "H"],
|
||||
)
|
||||
agg_funcs: list = tool_field(
|
||||
description="""List of aggregation functions for rolling, like ['mean', 'std'].
|
||||
Each function must be in ['mean', 'std', 'min', 'max', 'median', 'sum', 'count']."""
|
||||
)
|
||||
433
metagpt/tools/functions/schemas/feature_engineering.yml
Normal file
433
metagpt/tools/functions/schemas/feature_engineering.yml
Normal file
|
|
@ -0,0 +1,433 @@
|
|||
PolynomialExpansion:
|
||||
type: class
|
||||
description: "Add polynomial and interaction features from selected numeric columns, excluding the bias column."
|
||||
methods:
|
||||
__init__:
|
||||
description: "Initialize self."
|
||||
parameters:
|
||||
properties:
|
||||
cols:
|
||||
type: list
|
||||
description: "Columns for polynomial expansion."
|
||||
degree:
|
||||
type: int
|
||||
description: "The degree of the polynomial features."
|
||||
default: 2
|
||||
required:
|
||||
- cols
|
||||
fit:
|
||||
description: "Fit the PolynomialExpansion model."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
transform:
|
||||
description: "Transform the input DataFrame with the fitted model."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
returns:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The transformed DataFrame."
|
||||
fit_transform:
|
||||
description: "Fit and transform the input DataFrame."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
returns:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The transformed DataFrame."
|
||||
|
||||
CatCount:
|
||||
type: class
|
||||
description: "Add value counts of a categorical column as new feature."
|
||||
methods:
|
||||
__init__:
|
||||
description: "Initialize self."
|
||||
parameters:
|
||||
properties:
|
||||
col:
|
||||
type: str
|
||||
description: "Column for value counts."
|
||||
required:
|
||||
- col
|
||||
fit:
|
||||
description: "Fit the CatCount model."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
transform:
|
||||
description: "Transform the input DataFrame with the fitted model."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
returns:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The transformed DataFrame."
|
||||
fit_transform:
|
||||
description: "Fit and transform the input DataFrame."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
returns:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The transformed DataFrame."
|
||||
|
||||
TargetMeanEncoder:
|
||||
type: class
|
||||
description: "Encodes a categorical column by the mean of the label column, and adds the result as a new feature."
|
||||
methods:
|
||||
__init__:
|
||||
description: "Initialize self."
|
||||
parameters:
|
||||
properties:
|
||||
col:
|
||||
type: str
|
||||
description: "Column to be mean encoded."
|
||||
label:
|
||||
type: str
|
||||
description: "Predicted label column."
|
||||
required:
|
||||
- col
|
||||
- label
|
||||
fit:
|
||||
description: "Fit the TargetMeanEncoder model."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
transform:
|
||||
description: "Transform the input DataFrame with the fitted model."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
returns:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The transformed DataFrame."
|
||||
fit_transform:
|
||||
description: "Fit and transform the input DataFrame."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
returns:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The transformed DataFrame."
|
||||
|
||||
KFoldTargetMeanEncoder:
|
||||
type: class
|
||||
description: "Adds a new feature to the DataFrame by k-fold mean encoding of a categorical column using the label column."
|
||||
methods:
|
||||
__init__:
|
||||
description: "Initialize self."
|
||||
parameters:
|
||||
properties:
|
||||
col:
|
||||
type: str
|
||||
description: "Column to be k-fold mean encoded."
|
||||
label:
|
||||
type: str
|
||||
description: "Predicted label column."
|
||||
n_splits:
|
||||
type: int
|
||||
description: "Number of splits for K-fold."
|
||||
default: 5
|
||||
random_state:
|
||||
type: int
|
||||
description: "Random seed."
|
||||
default: 2021
|
||||
required:
|
||||
- col
|
||||
- label
|
||||
fit:
|
||||
description: "Fit the KFoldTargetMeanEncoder model."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
transform:
|
||||
description: "Transform the input DataFrame with the fitted model."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
returns:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The transformed DataFrame."
|
||||
fit_transform:
|
||||
description: "Fit and transform the input DataFrame."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
returns:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The transformed DataFrame."
|
||||
|
||||
CatCross:
|
||||
type: class
|
||||
description: "Add pairwise crossed features and convert them to numerical features."
|
||||
methods:
|
||||
__init__:
|
||||
description: "Initialize self."
|
||||
parameters:
|
||||
properties:
|
||||
cols:
|
||||
type: list
|
||||
description: "Columns to be pairwise crossed."
|
||||
max_cat_num:
|
||||
type: int
|
||||
description: "Maximum unique categories per crossed feature."
|
||||
default: 100
|
||||
required:
|
||||
- cols
|
||||
fit:
|
||||
description: "Fit the CatCross model."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
transform:
|
||||
description: "Transform the input DataFrame with the fitted model."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
returns:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The transformed DataFrame."
|
||||
fit_transform:
|
||||
description: "Fit and transform the input DataFrame."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
returns:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The transformed DataFrame."
|
||||
|
||||
GroupStat:
|
||||
type: class
|
||||
description: "Aggregate specified column in a DataFrame grouped by another column, adding new features named '<agg_col>_<agg_func>_by_<group_col>'."
|
||||
methods:
|
||||
__init__:
|
||||
description: "Initialize self."
|
||||
parameters:
|
||||
properties:
|
||||
group_col:
|
||||
type: str
|
||||
description: "Column used for grouping."
|
||||
agg_col:
|
||||
type: str
|
||||
description: "Column on which aggregation is performed."
|
||||
agg_funcs:
|
||||
type: list
|
||||
description: >-
|
||||
List of aggregation functions to apply, such as ['mean', 'std'].
|
||||
Each function must be supported by pandas.
|
||||
required:
|
||||
- group_col
|
||||
- agg_col
|
||||
- agg_funcs
|
||||
fit:
|
||||
description: "Fit the GroupStat model."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
transform:
|
||||
description: "Transform the input DataFrame with the fitted model."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
returns:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The transformed DataFrame."
|
||||
fit_transform:
|
||||
description: "Fit and transform the input DataFrame."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
returns:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The transformed DataFrame."
|
||||
|
||||
SplitBins:
|
||||
type: class
|
||||
description: "Inplace binning of continuous data into intervals, returning integer-encoded bin identifiers directly."
|
||||
methods:
|
||||
__init__:
|
||||
description: "Initialize self."
|
||||
parameters:
|
||||
properties:
|
||||
cols:
|
||||
type: list
|
||||
description: "Columns to be binned inplace."
|
||||
strategy:
|
||||
type: str
|
||||
description: "Strategy used to define the widths of the bins."
|
||||
default: quantile
|
||||
enum:
|
||||
- quantile
|
||||
- uniform
|
||||
- kmeans
|
||||
required:
|
||||
- cols
|
||||
fit:
|
||||
description: "Fit the SplitBins model."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
transform:
|
||||
description: "Transform the input DataFrame with the fitted model."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
returns:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The transformed DataFrame."
|
||||
fit_transform:
|
||||
description: "Fit and transform the input DataFrame."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
returns:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The transformed DataFrame."
|
||||
|
||||
GeneralSelection:
|
||||
type: class
|
||||
description: "Drop all nan feats and feats with only one unique value."
|
||||
methods:
|
||||
__init__:
|
||||
description: "Initialize self."
|
||||
parameters:
|
||||
properties:
|
||||
label_col:
|
||||
type: str
|
||||
description: "Label column name."
|
||||
required:
|
||||
- label_col
|
||||
fit:
|
||||
description: "Fit the GeneralSelection model."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
transform:
|
||||
description: "Transform the input DataFrame with the fitted model."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
returns:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The transformed DataFrame."
|
||||
fit_transform:
|
||||
description: "Fit and transform the input DataFrame."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
returns:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The transformed DataFrame."
|
||||
|
|
@ -1,55 +0,0 @@
|
|||
import pandas as pd
|
||||
|
||||
from metagpt.tools.functions.schemas.base import tool_field, ToolSchema
|
||||
|
||||
|
||||
class LogisticRegressionClassification(ToolSchema):
|
||||
"""Logistic Regression (aka logit, MaxEnt) classifier"""
|
||||
df: pd.DataFrame = tool_field(description="input dataframe")
|
||||
label: str = tool_field(description="target name")
|
||||
test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)
|
||||
penalty: str = tool_field(description="Specify the norm of the penalty", default="l2")
|
||||
dual: bool = tool_field(description="Dual (constrained) or primal (regularized) formulation", default="l2")
|
||||
|
||||
|
||||
class RandomForestClassification(ToolSchema):
|
||||
"""random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting"""
|
||||
df: pd.DataFrame = tool_field(description="input dataframe")
|
||||
label: str = tool_field(description="target name")
|
||||
test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)
|
||||
n_estimators: int = tool_field(description="The number of trees in the forest", default=100)
|
||||
criterion: str = tool_field(description="The function to measure the quality of a split", default="gini")
|
||||
|
||||
|
||||
class GradientBoostingClassification(ToolSchema):
|
||||
"""Gradient Boosting for classification.This algorithm builds an additive model in a forward stage-wise fashion"""
|
||||
df: pd.DataFrame = tool_field(description="input dataframe")
|
||||
label: str = tool_field(description="target name")
|
||||
test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)
|
||||
n_estimators: int = tool_field(description="The number of boosting stages to perform", default=100)
|
||||
learning_rate: float = tool_field(description="Learning rate shrinks the contribution of each tree by learning_rate", default=0.1)
|
||||
|
||||
|
||||
class LinearRegressionRegression(ToolSchema):
|
||||
"""Ordinary least squares Linear Regression."""
|
||||
df: pd.DataFrame = tool_field(description="input dataframe")
|
||||
label: str = tool_field(description="target name")
|
||||
test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)
|
||||
|
||||
|
||||
class RandomForestRegression(ToolSchema):
|
||||
"""random forest is a meta estimator that fits a number of decision tree on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting"""
|
||||
df: pd.DataFrame = tool_field(description="input dataframe")
|
||||
label: str = tool_field(description="target name")
|
||||
test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)
|
||||
n_estimators: int = tool_field(description="The number of trees in the forest", default=100)
|
||||
criterion: str = tool_field(description="The function to measure the quality of a split", default="squared_error")
|
||||
|
||||
|
||||
class GradientBoostingRegression(ToolSchema):
|
||||
"""Gradient Boosting for regression.This estimator builds an additive model in a forward stage-wise fashion"""
|
||||
df: pd.DataFrame = tool_field(description="input dataframe")
|
||||
label: str = tool_field(description="target name")
|
||||
test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)
|
||||
n_estimators: int = tool_field(description="The number of boosting stages to perform", default=100)
|
||||
learning_rate: float = tool_field(description="Learning rate shrinks the contribution of each tree by learning_rate", default=0.1)
|
||||
Loading…
Add table
Add a link
Reference in a new issue