mirror of
https://github.com/FoundationAgents/MetaGPT.git
synced 2026-04-28 18:36:22 +02:00
Merge branch 'dev' into 'dev_make_tools'
# Conflicts: # metagpt/roles/ml_engineer.py
This commit is contained in:
commit
c76c1765ef
6 changed files with 276 additions and 55 deletions
|
|
@ -1,3 +1,5 @@
|
|||
import json
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.impute import SimpleImputer
|
||||
|
|
@ -20,10 +22,14 @@ class FillMissingValue(MLProcess):
|
|||
self.si = None
|
||||
|
||||
def fit(self, df: pd.DataFrame):
    """Build and fit the imputer for the configured feature columns.

    Does nothing when no features are configured; otherwise creates a
    SimpleImputer from ``self.strategy`` / ``self.fill_value`` and fits
    it on ``df[self.features]``, storing it on ``self.si``.
    """
    if not self.features:
        # No columns configured -> leave self.si untouched.
        return
    self.si = SimpleImputer(strategy=self.strategy, fill_value=self.fill_value)
    self.si.fit(df[self.features])
|
||||
|
||||
def transform(self, df: pd.DataFrame):
    """Impute missing values in the configured columns of *df*.

    Mutates *df* in place (the imputed columns are written back) and
    returns it. Returns *df* unchanged when no features are configured.
    """
    if not self.features:
        return df
    imputed = self.si.transform(df[self.features])
    df[self.features] = imputed
    return df
|
||||
|
||||
|
|
@ -122,11 +128,15 @@ class LabelEncode(MLProcess):
|
|||
self.le_encoders = []
|
||||
|
||||
def fit(self, df: pd.DataFrame):
    """Fit one LabelEncoder per configured feature column.

    Each encoder is fitted on the column's unique string values plus an
    explicit 'unknown' category, so values unseen at fit time can still
    be mapped at transform time. Does nothing when no features are
    configured.
    """
    if not self.features:
        return
    # Reset the encoder list so refitting does not accumulate stale
    # encoders and misalign them with self.features.
    self.le_encoders = []
    for col in self.features:
        le = LabelEncoder().fit(df[col].astype(str).unique().tolist() + ['unknown'])
        self.le_encoders.append(le)
|
||||
|
||||
def transform(self, df: pd.DataFrame):
|
||||
if len(self.features) == 0:
|
||||
return df
|
||||
for i in range(len(self.features)):
|
||||
data_list = df[self.features[i]].astype(str).tolist()
|
||||
for unique_item in np.unique(df[self.features[i]].astype(str)):
|
||||
|
|
@ -137,17 +147,23 @@ class LabelEncode(MLProcess):
|
|||
|
||||
|
||||
def get_column_info(df: pd.DataFrame) -> dict:
    """Group the columns of *df* into coarse type buckets.

    Returns a dict with keys "Category", "Numeric", "Datetime" and
    "Others", each mapping to the list of column names whose dtype
    falls into that bucket. When the JSON-serialized result exceeds
    2000 characters, the "Numeric" list is truncated to its first five
    entries plus an omission marker to keep the output compact.

    Note: an unresolved merge left an early ``return samples.to_dict(...)``
    here that made the column_info truncation unreachable; this version
    keeps the reachable, intended behavior of returning ``column_info``.
    """
    column_info = {
        "Category": [],
        "Numeric": [],
        "Datetime": [],
        "Others": [],
    }
    for col in df.columns:
        # Normalize dtype spelling (e.g. "dtype('int64')" -> "int64").
        dtype_name = str(df[col].dtype).replace("dtype('", "").replace("')", "")
        if dtype_name.startswith("object"):
            column_info["Category"].append(col)
        elif dtype_name.startswith("int") or dtype_name.startswith("float"):
            column_info["Numeric"].append(col)
        elif dtype_name.startswith("datetime"):
            column_info["Datetime"].append(col)
        else:
            column_info["Others"].append(col)

    # Cap the serialized size; beyond 2000 chars, truncate Numeric.
    if len(json.dumps(column_info)) > 2000:
        column_info['Numeric'] = column_info['Numeric'][0:5] + ['Too many cols, omission here...']
    return column_info
|
||||
|
|
|
|||
|
|
@ -6,12 +6,12 @@
|
|||
# @Desc : Feature Engineering Tools
|
||||
import itertools
|
||||
|
||||
import lightgbm as lgb
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from dateutil.relativedelta import relativedelta
|
||||
from joblib import Parallel, delayed
|
||||
from pandas.api.types import is_numeric_dtype
|
||||
from pandas.core.dtypes.common import is_object_dtype
|
||||
from sklearn.feature_selection import VarianceThreshold
|
||||
from sklearn.model_selection import KFold
|
||||
from sklearn.preprocessing import PolynomialFeatures, KBinsDiscretizer
|
||||
|
||||
|
|
@ -19,15 +19,27 @@ from metagpt.tools.functions.libs.base import MLProcess
|
|||
|
||||
|
||||
class PolynomialExpansion(MLProcess):
|
||||
def __init__(self, cols: list, degree: int = 2, label_col: str = None):
    """Store polynomial-expansion settings.

    An unresolved merge left two conflicting ``__init__`` signatures
    here; this keeps the generalized one, whose ``label_col`` default
    of ``None`` is backward compatible with the older two-argument form.

    Args:
        cols: candidate numeric columns for polynomial expansion.
        degree: polynomial degree passed to PolynomialFeatures.
        label_col: optional label column name; removed from ``cols`` so
            the target itself is never expanded.
    """
    self.cols = cols
    self.degree = degree
    self.label_col = label_col
    # Never expand the target column itself.
    if self.label_col in self.cols:
        self.cols.remove(self.label_col)
    self.poly = PolynomialFeatures(degree=degree, include_bias=False)
|
||||
|
||||
def fit(self, df: pd.DataFrame):
    """Fit the underlying PolynomialFeatures on the selected columns.

    When more than 10 candidate columns are configured, keeps only the
    10 most correlated (by absolute value) with the label column. NaNs
    are filled with 0 before fitting.
    """
    if not self.cols:
        return
    if len(self.cols) > 10:
        # Rank candidates by |corr| with the label; position 0 is the
        # label's correlation with itself, so keep positions 1..10.
        ranking = df[self.cols + [self.label_col]].corr()[self.label_col].abs()
        ranking = ranking.sort_values(ascending=False)
        self.cols = ranking.index.tolist()[1:11]

    self.poly.fit(df[self.cols].fillna(0))
|
||||
|
||||
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
if len(self.cols) == 0:
|
||||
return df
|
||||
ts_data = self.poly.transform(df[self.cols].fillna(0))
|
||||
column_name = self.poly.get_feature_names_out(self.cols)
|
||||
ts_data = pd.DataFrame(ts_data, index=df.index, columns=column_name)
|
||||
|
|
@ -158,27 +170,35 @@ class SplitBins(MLProcess):
|
|||
df[self.cols] = self.encoder.transform(df[self.cols].fillna(0))
|
||||
return df
|
||||
|
||||
# @registry.register("feature_engineering", ExtractTimeComps)
|
||||
# def extract_time_comps(df, time_col, time_comps):
|
||||
# time_s = pd.to_datetime(df[time_col], errors="coerce")
|
||||
# time_comps_df = pd.DataFrame()
|
||||
#
|
||||
# if "year" in time_comps:
|
||||
# time_comps_df["year"] = time_s.dt.year
|
||||
# if "month" in time_comps:
|
||||
# time_comps_df["month"] = time_s.dt.month
|
||||
# if "day" in time_comps:
|
||||
# time_comps_df["day"] = time_s.dt.day
|
||||
# if "hour" in time_comps:
|
||||
# time_comps_df["hour"] = time_s.dt.hour
|
||||
# if "dayofweek" in time_comps:
|
||||
# time_comps_df["dayofweek"] = time_s.dt.dayofweek + 1
|
||||
# if "is_weekend" in time_comps:
|
||||
# time_comps_df["is_weekend"] = time_s.dt.dayofweek.isin([5, 6]).astype(int)
|
||||
# df = pd.concat([df, time_comps_df], axis=1)
|
||||
# return df
|
||||
#
|
||||
#
|
||||
|
||||
class ExtractTimeComps(MLProcess):
    """Derive calendar component columns from a datetime-like column.

    Parses ``time_col`` with ``pd.to_datetime`` (invalid values become
    NaT) and appends one column per requested component in
    ``time_comps``: year, month, day, hour, dayofweek (1-7) and
    is_weekend (1 for Saturday/Sunday, else 0).
    """

    def __init__(self, time_col: str, time_comps: list):
        self.time_col = time_col
        self.time_comps = time_comps

    def fit(self, df: pd.DataFrame):
        # Stateless transform: nothing to learn.
        pass

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        time_s = pd.to_datetime(df[self.time_col], errors="coerce")
        # Component name -> builder; insertion order matches the
        # original year/month/day/hour/dayofweek/is_weekend ordering.
        builders = {
            "year": lambda s: s.dt.year,
            "month": lambda s: s.dt.month,
            "day": lambda s: s.dt.day,
            "hour": lambda s: s.dt.hour,
            "dayofweek": lambda s: s.dt.dayofweek + 1,
            "is_weekend": lambda s: s.dt.dayofweek.isin([5, 6]).astype(int),
        }
        time_comps_df = pd.DataFrame()
        for comp, build in builders.items():
            if comp in self.time_comps:
                time_comps_df[comp] = build(time_s)
        return pd.concat([df, time_comps_df], axis=1)
|
||||
|
||||
|
||||
# @registry.register("feature_engineering", FeShiftByTime)
|
||||
# def fe_shift_by_time(df, time_col, group_col, shift_col, periods, freq):
|
||||
# df[time_col] = pd.to_datetime(df[time_col])
|
||||
|
|
@ -290,3 +310,66 @@ class GeneralSelection(MLProcess):
|
|||
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
df = df[self.feats + [self.label_col]]
|
||||
return df
|
||||
|
||||
|
||||
class TreeBasedSelection(MLProcess):
    """Select features via LightGBM gain importance.

    Trains a LightGBM model on the numeric columns and keeps only the
    features with positive gain importance (plus the label column).
    """

    def __init__(self, label_col: str, task_type: str):
        self.label_col = label_col
        # 'cls' (binary), 'mcls' (multiclass) or 'reg' (regression).
        self.task_type = task_type
        # Populated by fit(): selected feature names + label_col.
        self.feats = None

    def fit(self, df: pd.DataFrame):
        """Train LightGBM and record features with positive importance."""
        params = {
            'boosting_type': 'gbdt',
            'objective': 'binary',
            'learning_rate': 0.1,
            'num_leaves': 31,
        }

        # Per-task objective/metric overrides.
        task_overrides = {
            "cls": {"objective": "binary", "metric": "auc"},
            "mcls": {
                "objective": "multiclass",
                "num_class": df[self.label_col].nunique(),
                "metric": "auc_mu",
            },
            "reg": {"objective": "regression", "metric": "rmse"},
        }
        params.update(task_overrides.get(self.task_type, {}))

        numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
        feature_cols = [c for c in numeric_cols if c != self.label_col]

        dtrain = lgb.Dataset(df[feature_cols], df[self.label_col])
        model = lgb.train(params, dtrain, num_boost_round=100)
        importance = pd.DataFrame({'feature_name': dtrain.feature_name,
                                   'importance': model.feature_importance("gain")})

        importance = importance.sort_values("importance", ascending=False)
        # Drop features that contributed no gain at all.
        importance = importance[importance["importance"] > 0]
        self.feats = importance['feature_name'].tolist()
        self.feats.append(self.label_col)

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return only the selected features (label column included)."""
        return df[self.feats]
|
||||
|
||||
|
||||
class VarianceBasedSelection(MLProcess):
    """Drop numeric features whose variance is at or below a threshold."""

    def __init__(self, label_col: str, threshold: float = 0):
        self.label_col = label_col
        self.threshold = threshold
        # Populated by fit(): surviving columns + label_col.
        self.feats = None
        self.selector = VarianceThreshold(threshold=self.threshold)

    def fit(self, df: pd.DataFrame):
        """Fit the variance selector on the numeric, non-label columns."""
        numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
        candidate_cols = [c for c in numeric_cols if c != self.label_col]

        self.selector.fit(df[candidate_cols])
        kept = self.selector.get_support(indices=True)
        self.feats = df[candidate_cols].columns[kept].tolist()
        self.feats.append(self.label_col)

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return only the surviving features (label column included)."""
        return df[self.feats]
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ FillMissingValue:
|
|||
description: "columns to be processed"
|
||||
strategy:
|
||||
type: str
|
||||
description: "the imputation strategy"
|
||||
description: "the imputation strategy, notice mean/median can only be used for numeric features"
|
||||
default: mean
|
||||
enum:
|
||||
- mean
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
PolynomialExpansion:
|
||||
type: class
|
||||
description: "Add polynomial and interaction features from selected numeric columns, excluding the bias column."
|
||||
description: "Add polynomial and interaction features from selected numeric columns to input DataFrame."
|
||||
methods:
|
||||
__init__:
|
||||
description: "Initialize self."
|
||||
|
|
@ -9,12 +9,16 @@ PolynomialExpansion:
|
|||
cols:
|
||||
type: list
|
||||
description: "Columns for polynomial expansion."
|
||||
label_col:
|
||||
type: str
|
||||
description: "Label column name."
|
||||
degree:
|
||||
type: int
|
||||
description: "The degree of the polynomial features."
|
||||
default: 2
|
||||
required:
|
||||
- cols
|
||||
- label_col
|
||||
fit:
|
||||
description: "Fit the PolynomialExpansion model."
|
||||
parameters:
|
||||
|
|
@ -36,14 +40,14 @@ PolynomialExpansion:
|
|||
returns:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The transformed DataFrame."
|
||||
description: "The transformed DataFrame without duplicated columns."
|
||||
fit_transform:
|
||||
description: "Fit and transform the input DataFrame."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
description: "The input DataFrame without duplicated columns."
|
||||
required:
|
||||
- df
|
||||
returns:
|
||||
|
|
@ -224,7 +228,7 @@ CatCross:
|
|||
properties:
|
||||
cols:
|
||||
type: list
|
||||
description: "Columns to be pairwise crossed."
|
||||
description: "Columns to be pairwise crossed, at least 2 columns."
|
||||
max_cat_num:
|
||||
type: int
|
||||
description: "Maximum unique categories per crossed feature."
|
||||
|
|
@ -430,4 +434,115 @@ GeneralSelection:
|
|||
returns:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The transformed DataFrame."
|
||||
description: "The transformed DataFrame."
|
||||
|
||||
|
||||
TreeBasedSelection:
|
||||
type: class
|
||||
description: "Select features based on tree-based model and remove features with low importance."
|
||||
methods:
|
||||
__init__:
|
||||
description: "Initialize self."
|
||||
parameters:
|
||||
properties:
|
||||
label_col:
|
||||
type: str
|
||||
description: "Label column name."
|
||||
task_type:
|
||||
type: str
|
||||
description: "Task type, 'cls' for classification, 'mcls' for multi-class classification, 'reg' for regression."
|
||||
enum:
|
||||
- cls
|
||||
- mcls
|
||||
- reg
|
||||
required:
|
||||
- label_col
|
||||
- task_type
|
||||
fit:
|
||||
description: "Fit the TreeBasedSelection model."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
transform:
|
||||
description: "Transform the input DataFrame with the fitted model."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
returns:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The transformed DataFrame, containing label_col."
|
||||
fit_transform:
|
||||
description: "Fit and transform the input DataFrame."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
returns:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The transformed DataFrame, containing label_col."
|
||||
|
||||
VarianceBasedSelection:
|
||||
type: class
|
||||
description: "Select features based on variance and remove features with low variance."
|
||||
methods:
|
||||
__init__:
|
||||
description: "Initialize self."
|
||||
parameters:
|
||||
properties:
|
||||
label_col:
|
||||
type: str
|
||||
description: "Label column name."
|
||||
threshold:
|
||||
type: float
|
||||
description: "Threshold for variance."
|
||||
default: 0.0
|
||||
required:
|
||||
- label_col
|
||||
fit:
|
||||
description: "Fit the VarianceBasedSelection model."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
transform:
|
||||
description: "Transform the input DataFrame with the fitted model."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
returns:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The transformed DataFrame, containing label_col."
|
||||
fit_transform:
|
||||
description: "Fit and transform the input DataFrame."
|
||||
parameters:
|
||||
properties:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The input DataFrame."
|
||||
required:
|
||||
- df
|
||||
returns:
|
||||
df:
|
||||
type: DataFrame
|
||||
description: "The transformed DataFrame, containing label_col."
|
||||
Loading…
Add table
Add a link
Reference in a new issue