add tool registry

This commit is contained in:
yzlin 2024-01-13 01:28:49 +08:00
parent 224bf820b2
commit 46cd219e81
25 changed files with 1582 additions and 59 deletions

View file

@ -8,17 +8,6 @@
from enum import Enum
from pydantic import BaseModel
from metagpt.const import TOOL_LIBS_PATH
from metagpt.prompts.tool_type import (
DATA_PREPROCESS_PROMPT,
FEATURE_ENGINEERING_PROMPT,
MODEL_TRAIN_PROMPT,
MODEL_EVALUATE_PROMPT,
VISION_PROMPT,
)
class SearchEngineType(Enum):
SERPAPI_GOOGLE = "serpapi"

View file

@ -14,8 +14,13 @@ from sklearn.preprocessing import (
)
from metagpt.tools.functions.libs.base import MLProcess
from metagpt.tools.tool_registry import register_tool
from metagpt.tools.tool_schema import ToolTypeEnum
TOOL_TYPE = ToolTypeEnum.DATA_PREPROCESS.value
@register_tool(tool_type_name=TOOL_TYPE)
class FillMissingValue(MLProcess):
def __init__(
self,
@ -42,6 +47,7 @@ class FillMissingValue(MLProcess):
return new_df
@register_tool(tool_type_name=TOOL_TYPE)
class MinMaxScale(MLProcess):
def __init__(
self,
@ -60,6 +66,7 @@ class MinMaxScale(MLProcess):
return new_df
@register_tool(tool_type_name=TOOL_TYPE)
class StandardScale(MLProcess):
def __init__(
self,
@ -78,6 +85,7 @@ class StandardScale(MLProcess):
return new_df
@register_tool(tool_type_name=TOOL_TYPE)
class MaxAbsScale(MLProcess):
def __init__(
self,
@ -96,6 +104,7 @@ class MaxAbsScale(MLProcess):
return new_df
@register_tool(tool_type_name=TOOL_TYPE)
class RobustScale(MLProcess):
def __init__(
self,
@ -114,6 +123,7 @@ class RobustScale(MLProcess):
return new_df
@register_tool(tool_type_name=TOOL_TYPE)
class OrdinalEncode(MLProcess):
def __init__(
self,
@ -132,6 +142,7 @@ class OrdinalEncode(MLProcess):
return new_df
@register_tool(tool_type_name=TOOL_TYPE)
class OneHotEncode(MLProcess):
def __init__(
self,
@ -153,6 +164,7 @@ class OneHotEncode(MLProcess):
return new_df
@register_tool(tool_type_name=TOOL_TYPE)
class LabelEncode(MLProcess):
def __init__(
self,
@ -181,6 +193,7 @@ class LabelEncode(MLProcess):
return new_df
@register_tool(tool_type_name=TOOL_TYPE)
def get_column_info(df: pd.DataFrame) -> dict:
column_info = {
"Category": [],

View file

@ -6,7 +6,7 @@
# @Desc : Feature Engineering Tools
import itertools
import lightgbm as lgb
# import lightgbm as lgb
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
@ -16,8 +16,13 @@ from sklearn.model_selection import KFold
from sklearn.preprocessing import KBinsDiscretizer, PolynomialFeatures
from metagpt.tools.functions.libs.base import MLProcess
from metagpt.tools.tool_registry import register_tool
from metagpt.tools.tool_schema import ToolTypeEnum
TOOL_TYPE = ToolTypeEnum.FEATURE_ENGINEERING.value
@register_tool(tool_type_name=TOOL_TYPE)
class PolynomialExpansion(MLProcess):
def __init__(self, cols: list, degree: int = 2, label_col: str = None):
self.cols = cols
@ -48,6 +53,7 @@ class PolynomialExpansion(MLProcess):
return new_df
@register_tool(tool_type_name=TOOL_TYPE)
class CatCount(MLProcess):
def __init__(self, col: str):
self.col = col
@ -62,6 +68,7 @@ class CatCount(MLProcess):
return new_df
@register_tool(tool_type_name=TOOL_TYPE)
class TargetMeanEncoder(MLProcess):
def __init__(self, col: str, label: str):
self.col = col
@ -77,6 +84,7 @@ class TargetMeanEncoder(MLProcess):
return new_df
@register_tool(tool_type_name=TOOL_TYPE)
class KFoldTargetMeanEncoder(MLProcess):
def __init__(self, col: str, label: str, n_splits: int = 5, random_state: int = 2021):
self.col = col
@ -103,6 +111,7 @@ class KFoldTargetMeanEncoder(MLProcess):
return new_df
@register_tool(tool_type_name=TOOL_TYPE)
class CatCross(MLProcess):
def __init__(self, cols: list, max_cat_num: int = 100):
self.cols = cols
@ -138,6 +147,7 @@ class CatCross(MLProcess):
return new_df
@register_tool(tool_type_name=TOOL_TYPE)
class GroupStat(MLProcess):
def __init__(self, group_col: str, agg_col: str, agg_funcs: list):
self.group_col = group_col
@ -157,6 +167,7 @@ class GroupStat(MLProcess):
return new_df
@register_tool(tool_type_name=TOOL_TYPE)
class SplitBins(MLProcess):
def __init__(self, cols: list, strategy: str = "quantile"):
self.cols = cols
@ -173,6 +184,7 @@ class SplitBins(MLProcess):
return new_df
@register_tool(tool_type_name=TOOL_TYPE)
class ExtractTimeComps(MLProcess):
def __init__(self, time_col: str, time_comps: list):
self.time_col = time_col
@ -201,6 +213,7 @@ class ExtractTimeComps(MLProcess):
return new_df
@register_tool(tool_type_name=TOOL_TYPE)
class GeneralSelection(MLProcess):
def __init__(self, label_col: str):
self.label_col = label_col
@ -228,6 +241,7 @@ class GeneralSelection(MLProcess):
return new_df
# skip for now because lgb is needed
class TreeBasedSelection(MLProcess):
def __init__(self, label_col: str, task_type: str):
self.label_col = label_col
@ -270,6 +284,7 @@ class TreeBasedSelection(MLProcess):
return new_df
@register_tool(tool_type_name=TOOL_TYPE)
class VarianceBasedSelection(MLProcess):
def __init__(self, label_col: str, threshold: float = 0):
self.label_col = label_col

View file

@ -0,0 +1,61 @@
FillMissingValue:
type: class
description: "Completing missing values with simple strategies"
methods:
__init__:
description: "Initialize self."
parameters:
properties:
features:
type: list
description: "columns to be processed"
strategy:
type: str
description: "the imputation strategy, notice mean/median can only be used for numeric features"
default: mean
enum:
- mean
- median
- most_frequent
- constant
fill_value:
type: int
description: "fill_value is used to replace all occurrences of missing_values"
default: null
required:
- features
fit:
description: "Fit the FillMissingValue model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."

View file

@ -0,0 +1,48 @@
LabelEncode:
type: class
description: "Apply label encoding to specified categorical columns in-place."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
features:
type: list
description: "Categorical columns to be label encoded"
required:
- features
fit:
description: "Fit the LabelEncode model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."

View file

@ -0,0 +1,48 @@
MaxAbsScale:
type: class
description: "Scale each feature by its maximum absolute value"
methods:
__init__:
description: "Initialize self."
parameters:
properties:
features:
type: list
description: "columns to be processed"
required:
- features
fit:
description: "Fit the MaxAbsScale model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."

View file

@ -0,0 +1,48 @@
MinMaxScale:
type: class
description: "Transform features by scaling each feature to a range, which is (0, 1)"
methods:
__init__:
description: "Initialize self."
parameters:
properties:
features:
type: list
description: "columns to be processed"
required:
- features
fit:
description: "Fit the MinMaxScale model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."

View file

@ -0,0 +1,48 @@
OneHotEncode:
type: class
description: "Apply one-hot encoding to specified categorical columns, the original columns will be dropped."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
features:
type: list
description: "Categorical columns to be one-hot encoded and dropped"
required:
- features
fit:
description: "Fit the OneHotEncode model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."

View file

@ -0,0 +1,48 @@
StandardScale:
type: class
description: "Standardize features by removing the mean and scaling to unit variance"
methods:
__init__:
description: "Initialize self."
parameters:
properties:
features:
type: list
description: "columns to be processed"
required:
- features
fit:
description: "Fit the StandardScale model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."

View file

@ -0,0 +1,48 @@
CatCount:
type: class
description: "Add value counts of a categorical column as new feature."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
col:
type: str
description: "Column for value counts."
required:
- col
fit:
description: "Fit the CatCount model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."

View file

@ -0,0 +1,52 @@
CatCross:
type: class
description: "Add pairwise crossed features and convert them to numerical features."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
cols:
type: list
description: "Columns to be pairwise crossed, at least 2 columns."
max_cat_num:
type: int
description: "Maximum unique categories per crossed feature."
default: 100
required:
- cols
fit:
description: "Fit the CatCross model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."

View file

@ -0,0 +1,48 @@
GeneralSelection:
type: class
description: "Drop all nan feats and feats with only one unique value."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
label_col:
type: str
description: "Label column name."
required:
- label_col
fit:
description: "Fit the GeneralSelection model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."

View file

@ -0,0 +1,58 @@
GroupStat:
type: class
description: "Aggregate specified column in a DataFrame grouped by another column, adding new features named '<agg_col>_<agg_func>_by_<group_col>'."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
group_col:
type: str
description: "Column used for grouping."
agg_col:
type: str
description: "Column on which aggregation is performed."
agg_funcs:
type: list
description: >-
List of aggregation functions to apply, such as ['mean', 'std'].
Each function must be supported by pandas.
required:
- group_col
- agg_col
- agg_funcs
fit:
description: "Fit the GroupStat model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."

View file

@ -0,0 +1,60 @@
KFoldTargetMeanEncoder:
type: class
description: "Adds a new feature to the DataFrame by k-fold mean encoding of a categorical column using the label column."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
col:
type: str
description: "Column to be k-fold mean encoded."
label:
type: str
description: "Predicted label column."
n_splits:
type: int
description: "Number of splits for K-fold."
default: 5
random_state:
type: int
description: "Random seed."
default: 2021
required:
- col
- label
fit:
description: "Fit the KFoldTargetMeanEncoder model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."

View file

@ -0,0 +1,548 @@
PolynomialExpansion:
type: class
description: "Add polynomial and interaction features from selected numeric columns to input DataFrame."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
cols:
type: list
description: "Columns for polynomial expansion."
label_col:
type: str
description: "Label column name."
degree:
type: int
description: "The degree of the polynomial features."
default: 2
required:
- cols
- label_col
fit:
description: "Fit the PolynomialExpansion model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame without duplicated columns."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame without duplicated columns."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
CatCount:
type: class
description: "Add value counts of a categorical column as new feature."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
col:
type: str
description: "Column for value counts."
required:
- col
fit:
description: "Fit the CatCount model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
TargetMeanEncoder:
type: class
description: "Encodes a categorical column by the mean of the label column, and adds the result as a new feature."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
col:
type: str
description: "Column to be mean encoded."
label:
type: str
description: "Predicted label column."
required:
- col
- label
fit:
description: "Fit the TargetMeanEncoder model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
KFoldTargetMeanEncoder:
type: class
description: "Adds a new feature to the DataFrame by k-fold mean encoding of a categorical column using the label column."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
col:
type: str
description: "Column to be k-fold mean encoded."
label:
type: str
description: "Predicted label column."
n_splits:
type: int
description: "Number of splits for K-fold."
default: 5
random_state:
type: int
description: "Random seed."
default: 2021
required:
- col
- label
fit:
description: "Fit the KFoldTargetMeanEncoder model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
CatCross:
type: class
description: "Add pairwise crossed features and convert them to numerical features."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
cols:
type: list
description: "Columns to be pairwise crossed, at least 2 columns."
max_cat_num:
type: int
description: "Maximum unique categories per crossed feature."
default: 100
required:
- cols
fit:
description: "Fit the CatCross model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
GroupStat:
type: class
description: "Aggregate specified column in a DataFrame grouped by another column, adding new features named '<agg_col>_<agg_func>_by_<group_col>'."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
group_col:
type: str
description: "Column used for grouping."
agg_col:
type: str
description: "Column on which aggregation is performed."
agg_funcs:
type: list
description: >-
List of aggregation functions to apply, such as ['mean', 'std'].
Each function must be supported by pandas.
required:
- group_col
- agg_col
- agg_funcs
fit:
description: "Fit the GroupStat model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
SplitBins:
type: class
description: "Inplace binning of continuous data into intervals, returning integer-encoded bin identifiers directly."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
cols:
type: list
description: "Columns to be binned inplace."
strategy:
type: str
description: "Strategy used to define the widths of the bins."
default: quantile
enum:
- quantile
- uniform
- kmeans
required:
- cols
fit:
description: "Fit the SplitBins model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
GeneralSelection:
type: class
description: "Drop all nan feats and feats with only one unique value."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
label_col:
type: str
description: "Label column name."
required:
- label_col
fit:
description: "Fit the GeneralSelection model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
TreeBasedSelection:
type: class
description: "Select features based on tree-based model and remove features with low importance."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
label_col:
type: str
description: "Label column name."
task_type:
type: str
description: "Task type, 'cls' for classification, 'mcls' for multi-class classification, 'reg' for regression."
enum:
- cls
- mcls
- reg
required:
- label_col
- task_type
fit:
description: "Fit the TreeBasedSelection model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame containing label_col."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame containing label_col."
VarianceBasedSelection:
type: class
description: "Select features based on variance and remove features with low variance."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
label_col:
type: str
description: "Label column name."
threshold:
type: float
description: "Threshold for variance."
default: 0.0
required:
- label_col
fit:
description: "Fit the VarianceBasedSelection model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame containing label_col."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame containing label_col."

View file

@ -0,0 +1,56 @@
SplitBins:
type: class
description: "Inplace binning of continuous data into intervals, returning integer-encoded bin identifiers directly."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
cols:
type: list
description: "Columns to be binned inplace."
strategy:
type: str
description: "Strategy used to define the widths of the bins."
default: quantile
enum:
- quantile
- uniform
- kmeans
required:
- cols
fit:
description: "Fit the SplitBins model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."

View file

@ -0,0 +1,52 @@
TargetMeanEncoder:
type: class
description: "Encodes a categorical column by the mean of the label column, and adds the result as a new feature."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
col:
type: str
description: "Column to be mean encoded."
label:
type: str
description: "Predicted label column."
required:
- col
- label
fit:
description: "Fit the TargetMeanEncoder model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."

View file

@ -0,0 +1,56 @@
TreeBasedSelection:
type: class
description: "Select features based on tree-based model and remove features with low importance."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
label_col:
type: str
description: "Label column name."
task_type:
type: str
description: "Task type, 'cls' for classification, 'mcls' for multi-class classification, 'reg' for regression."
enum:
- cls
- mcls
- reg
required:
- label_col
- task_type
fit:
description: "Fit the TreeBasedSelection model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame containing label_col."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame containing label_col."

View file

@ -0,0 +1,52 @@
VarianceBasedSelection:
type: class
description: "Select features based on variance and remove features with low variance."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
label_col:
type: str
description: "Label column name."
threshold:
type: float
description: "Threshold for variance."
default: 0.0
required:
- label_col
fit:
description: "Fit the VarianceBasedSelection model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame containing label_col."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame containing label_col."

View file

@ -0,0 +1,128 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Time : 2023/01/12 17:07
@Author : garylin2099
@File : tool_registry.py
"""
import os
from collections import defaultdict
import inspect
import re
import yaml
from metagpt.tools.tool_schema import ToolType, ToolSchema, Tool
from metagpt.logs import logger
from metagpt.const import TOOL_SCHEMA_PATH
class ToolRegistry:
    """In-memory registry of tools and tool types.

    Tools are indexed by name (``tools``) and, in parallel, by tool type name
    (``tools_by_types``). Tool types are indexed by their ``name`` attribute
    (``tool_types``).
    """

    def __init__(self):
        self.tools = {}
        self.tool_types = {}
        # two-layer k-v, {tool_type_name: {tool_name: {...}, ...}, ...}
        self.tools_by_types = defaultdict(dict)

    def register_tool_type(self, tool_type: "ToolType"):
        """Store ``tool_type`` under ``tool_type.name``, replacing any previous entry."""
        self.tool_types[tool_type.name] = tool_type

    def register_tool(
        self,
        tool_name,
        tool_path,
        schema_path=None,
        tool_code="",
        tool_type_name="other",
        make_schema_if_not_exists=False,
    ):
        """Register a tool together with the schema loaded from its YAML file.

        Args:
            tool_name: Name of the tool; also the top-level key looked up in the schema file.
            tool_path: Path of the code file that defines the tool.
            schema_path: Location of the schema file. Defaults to
                ``TOOL_SCHEMA_PATH / tool_type_name / f"{tool_name}.yml"``.
            tool_code: Source code of the tool.
            tool_type_name: Name of the tool type the tool is filed under.
            make_schema_if_not_exists: When the schema file is missing, create a
                placeholder file instead of skipping the registration.

        Returns:
            None. The call is a no-op when the tool is already registered, or when
            the schema file is missing and ``make_schema_if_not_exists`` is false.
        """
        if self.has_tool(tool_name):
            return

        schema_path = schema_path or TOOL_SCHEMA_PATH / tool_type_name / f"{tool_name}.yml"

        if not os.path.exists(schema_path):
            if make_schema_if_not_exists:
                logger.warning(f"no schema found, will make schema at {schema_path}")
                make_schema(tool_code, schema_path)
            else:
                logger.warning(f"no schema found at assumed schema_path {schema_path}, skip registering {tool_name}")
                return

        with open(schema_path, "r", encoding="utf-8") as f:
            loaded = yaml.safe_load(f)

        # A freshly made placeholder file (or an empty / unrelated one) has no
        # entry for this tool; fall back to an empty schema instead of raising.
        schema = loaded.get(tool_name) if isinstance(loaded, dict) else None
        if not isinstance(schema, dict):
            logger.warning(f"no schema entry for {tool_name} in {schema_path}, an empty schema will be used")
            schema = {}

        schema["tool_path"] = tool_path  # corresponding code file path of the tool
        try:
            ToolSchema(**schema)  # validation
        except Exception as e:
            # Validation is best-effort: the schema is used even when it does not
            # conform, but the mismatch is recorded rather than silently dropped.
            logger.debug(
                f"{tool_name} schema not conforms to required format, but will be used anyway. Mismatch: {e}"
            )

        tool = Tool(name=tool_name, path=tool_path, schema=schema, code=tool_code)
        self.tools[tool_name] = tool
        self.tools_by_types[tool_type_name][tool_name] = tool
        logger.info(f"{tool_name} registered")

    def has_tool(self, key):
        """Return True when a tool named ``key`` is registered."""
        return key in self.tools

    def get_tool(self, key):
        """Return the tool named ``key``, or None when it is not registered."""
        return self.tools.get(key)

    def get_tools_by_type(self, key):
        """Return the ``{tool_name: tool}`` dict for tool type ``key``, or None when absent."""
        return self.tools_by_types.get(key)

    def has_tool_type(self, key):
        """Return True when a tool type named ``key`` is registered."""
        return key in self.tool_types

    def get_tool_type(self, key):
        """Return the tool type named ``key``, or None when it is not registered."""
        return self.tool_types.get(key)

    def get_tool_types(self):
        """Return the dict of all registered tool types, keyed by name."""
        return self.tool_types
# Module-level registry instance; the register_tool_type / register_tool
# decorators defined below write into it.
TOOL_REGISTRY = ToolRegistry()
def register_tool_type(cls):
    """Class decorator that adds an instance of ``cls`` to the global registry.

    The class is instantiated with no arguments, the instance is handed to
    ``TOOL_REGISTRY.register_tool_type``, and the class itself is returned
    unchanged.
    """
    type_instance = cls()
    TOOL_REGISTRY.register_tool_type(tool_type=type_instance)
    return cls
def register_tool(tool_name="", tool_type_name="other", schema_path=None):
    """Build a decorator that records a class or function in ``TOOL_REGISTRY``.

    Args:
        tool_name: Name to register under; the decorated object's ``__name__``
            is used when this is empty.
        tool_type_name: Name of the tool type the tool is filed under.
        schema_path: Optional explicit schema file location, passed through to
            the registry.

    Returns:
        A decorator that registers its argument and returns it unchanged.
    """

    def decorator(cls, tool_name=tool_name):
        registered_name = tool_name if tool_name else cls.__name__

        # Locate the defining file; when the path mentions "metagpt", keep only
        # the part starting at its first occurrence.
        location = inspect.getfile(cls)
        if "metagpt" in location:
            location = re.search("metagpt.+", location).group(0)

        TOOL_REGISTRY.register_tool(
            tool_name=registered_name,
            tool_path=location,
            schema_path=schema_path,
            tool_code=inspect.getsource(cls),
            tool_type_name=tool_type_name,
        )
        return cls

    return decorator
def make_schema(tool_code, path):
    """Write a placeholder (empty) YAML schema file at ``path``.

    Args:
        tool_code: Source code of the tool. Currently unused: the schema
            written is empty for now.
        path: Destination of the schema file; missing parent directories are
            created.

    Returns:
        The ``path`` that was written.
    """
    parent_dir = os.path.dirname(path)
    if parent_dir:
        # os.makedirs("") raises, so skip directory creation for a bare file name.
        os.makedirs(parent_dir, exist_ok=True)  # Create the necessary directories

    schema = {}  # an empty schema for now
    with open(path, "w", encoding="utf-8") as f:
        yaml.dump(schema, f)
    return path

View file

@ -0,0 +1,31 @@
from enum import Enum
from pydantic import BaseModel
class ToolTypeEnum(Enum):
    """Names of the supported tool types.

    Looking up a value that is not a member, e.g. ``ToolTypeEnum("unknown")``,
    falls back to ``OTHER`` instead of raising ``ValueError``.
    """

    DATA_PREPROCESS = "data_preprocess"
    FEATURE_ENGINEERING = "feature_engineering"
    MODEL_TRAIN = "model_train"
    MODEL_EVALUATE = "model_evaluate"
    OTHER = "other"

    @classmethod
    def _missing_(cls, value):
        # Enum consults the ``_missing_`` classmethod for unknown values;
        # a ``__missing__`` method is never called on an Enum.
        return cls.OTHER
# Describes a category of tools: a name, a description and an optional usage prompt.
class ToolType(BaseModel):
name: str
desc: str
usage_prompt: str = ""  # empty by default
# Pydantic model for a tool schema; the only field declared here is a string `name`.
class ToolSchema(BaseModel):
name: str
# Record for a single tool: a name, a path, a schema dict and a code string.
# NOTE(review): `schema` is also the name of a pydantic BaseModel method - confirm the shadowing is intended.
class Tool(BaseModel):
name: str
path: str
schema: dict = {}  # empty by default
code: str = ""  # empty by default

View file

@ -0,0 +1,43 @@
from metagpt.prompts.tool_type import (
DATA_PREPROCESS_PROMPT,
FEATURE_ENGINEERING_PROMPT,
MODEL_TRAIN_PROMPT,
MODEL_EVALUATE_PROMPT,
)
from metagpt.tools.tool_schema import ToolTypeEnum, ToolType
from metagpt.tools.tool_registry import register_tool_type
# Tool type named ToolTypeEnum.DATA_PREPROCESS.value, using DATA_PREPROCESS_PROMPT as its usage prompt.
@register_tool_type
class DataPreprocess(ToolType):
name: str = ToolTypeEnum.DATA_PREPROCESS.value
desc: str = "Only for changing value inplace."
usage_prompt: str = DATA_PREPROCESS_PROMPT
# Tool type named ToolTypeEnum.FEATURE_ENGINEERING.value, using FEATURE_ENGINEERING_PROMPT as its usage prompt.
@register_tool_type
class FeatureEngineer(ToolType):
name: str = ToolTypeEnum.FEATURE_ENGINEERING.value
desc: str = "Only for creating new columns for input data."
usage_prompt: str = FEATURE_ENGINEERING_PROMPT
# Tool type named ToolTypeEnum.MODEL_TRAIN.value, using MODEL_TRAIN_PROMPT as its usage prompt.
@register_tool_type
class ModelTrain(ToolType):
name: str = ToolTypeEnum.MODEL_TRAIN.value
desc: str = "Only for training model."
usage_prompt: str = MODEL_TRAIN_PROMPT
# Tool type named ToolTypeEnum.MODEL_EVALUATE.value, using MODEL_EVALUATE_PROMPT as its usage prompt.
@register_tool_type
class ModelEvaluate(ToolType):
name: str = ToolTypeEnum.MODEL_EVALUATE.value
desc: str = "Only for evaluating model."
usage_prompt: str = MODEL_EVALUATE_PROMPT
# Catch-all tool type named ToolTypeEnum.OTHER.value; it has no usage prompt.
@register_tool_type
class Other(ToolType):
name: str = ToolTypeEnum.OTHER.value
desc: str = "Any tools not in the defined categories"
usage_prompt: str = ""