add tool registry

2026-07-08 16:12:16 +02:00 · 2024-01-13 01:28:49 +08:00 · 2024-01-13 01:28:49 +08:00 · 46cd219e81
commit 46cd219e81
parent 224bf820b2
25 changed files with 1582 additions and 59 deletions
--- a/metagpt/tools/__init__.py
+++ b/metagpt/tools/__init__.py
@ -8,17 +8,6 @@

 from enum import Enum

-from pydantic import BaseModel
-
-from metagpt.const import TOOL_LIBS_PATH
-from metagpt.prompts.tool_type import (
-    DATA_PREPROCESS_PROMPT,
-    FEATURE_ENGINEERING_PROMPT,
-    MODEL_TRAIN_PROMPT,
-    MODEL_EVALUATE_PROMPT,
-    VISION_PROMPT,
-)
-

 class SearchEngineType(Enum):
    SERPAPI_GOOGLE = "serpapi"
--- a/metagpt/tools/functions/libs/data_preprocess.py
+++ b/metagpt/tools/functions/libs/data_preprocess.py
@ -14,8 +14,13 @@ from sklearn.preprocessing import (
 )

 from metagpt.tools.functions.libs.base import MLProcess
+from metagpt.tools.tool_registry import register_tool
+from metagpt.tools.tool_schema import ToolTypeEnum
+
+TOOL_TYPE = ToolTypeEnum.DATA_PREPROCESS.value


+@register_tool(tool_type_name=TOOL_TYPE)
 class FillMissingValue(MLProcess):
    def __init__(
        self,
@ -42,6 +47,7 @@ class FillMissingValue(MLProcess):
        return new_df


+@register_tool(tool_type_name=TOOL_TYPE)
 class MinMaxScale(MLProcess):
    def __init__(
        self,
@ -60,6 +66,7 @@ class MinMaxScale(MLProcess):
        return new_df


+@register_tool(tool_type_name=TOOL_TYPE)
 class StandardScale(MLProcess):
    def __init__(
        self,
@ -78,6 +85,7 @@ class StandardScale(MLProcess):
        return new_df


+@register_tool(tool_type_name=TOOL_TYPE)
 class MaxAbsScale(MLProcess):
    def __init__(
        self,
@ -96,6 +104,7 @@ class MaxAbsScale(MLProcess):
        return new_df


+@register_tool(tool_type_name=TOOL_TYPE)
 class RobustScale(MLProcess):
    def __init__(
        self,
@ -114,6 +123,7 @@ class RobustScale(MLProcess):
        return new_df


+@register_tool(tool_type_name=TOOL_TYPE)
 class OrdinalEncode(MLProcess):
    def __init__(
        self,
@ -132,6 +142,7 @@ class OrdinalEncode(MLProcess):
        return new_df


+@register_tool(tool_type_name=TOOL_TYPE)
 class OneHotEncode(MLProcess):
    def __init__(
        self,
@ -153,6 +164,7 @@ class OneHotEncode(MLProcess):
        return new_df


+@register_tool(tool_type_name=TOOL_TYPE)
 class LabelEncode(MLProcess):
    def __init__(
        self,
@ -181,6 +193,7 @@ class LabelEncode(MLProcess):
        return new_df


+@register_tool(tool_type_name=TOOL_TYPE)
 def get_column_info(df: pd.DataFrame) -> dict:
    column_info = {
        "Category": [],
--- a/metagpt/tools/functions/libs/feature_engineering.py
+++ b/metagpt/tools/functions/libs/feature_engineering.py
@ -6,7 +6,7 @@
 # @Desc    : Feature Engineering Tools
 import itertools

-import lightgbm as lgb
+# import lightgbm as lgb
 import numpy as np
 import pandas as pd
 from joblib import Parallel, delayed
@ -16,8 +16,13 @@ from sklearn.model_selection import KFold
 from sklearn.preprocessing import KBinsDiscretizer, PolynomialFeatures

 from metagpt.tools.functions.libs.base import MLProcess
+from metagpt.tools.tool_registry import register_tool
+from metagpt.tools.tool_schema import ToolTypeEnum
+
+TOOL_TYPE = ToolTypeEnum.FEATURE_ENGINEERING.value


+@register_tool(tool_type_name=TOOL_TYPE)
 class PolynomialExpansion(MLProcess):
    def __init__(self, cols: list, degree: int = 2, label_col: str = None):
        self.cols = cols
@ -48,6 +53,7 @@ class PolynomialExpansion(MLProcess):
        return new_df


+@register_tool(tool_type_name=TOOL_TYPE)
 class CatCount(MLProcess):
    def __init__(self, col: str):
        self.col = col
@ -62,6 +68,7 @@ class CatCount(MLProcess):
        return new_df


+@register_tool(tool_type_name=TOOL_TYPE)
 class TargetMeanEncoder(MLProcess):
    def __init__(self, col: str, label: str):
        self.col = col
@ -77,6 +84,7 @@ class TargetMeanEncoder(MLProcess):
        return new_df


+@register_tool(tool_type_name=TOOL_TYPE)
 class KFoldTargetMeanEncoder(MLProcess):
    def __init__(self, col: str, label: str, n_splits: int = 5, random_state: int = 2021):
        self.col = col
@ -103,6 +111,7 @@ class KFoldTargetMeanEncoder(MLProcess):
        return new_df


+@register_tool(tool_type_name=TOOL_TYPE)
 class CatCross(MLProcess):
    def __init__(self, cols: list, max_cat_num: int = 100):
        self.cols = cols
@ -138,6 +147,7 @@ class CatCross(MLProcess):
        return new_df


+@register_tool(tool_type_name=TOOL_TYPE)
 class GroupStat(MLProcess):
    def __init__(self, group_col: str, agg_col: str, agg_funcs: list):
        self.group_col = group_col
@ -157,6 +167,7 @@ class GroupStat(MLProcess):
        return new_df


+@register_tool(tool_type_name=TOOL_TYPE)
 class SplitBins(MLProcess):
    def __init__(self, cols: list, strategy: str = "quantile"):
        self.cols = cols
@ -173,6 +184,7 @@ class SplitBins(MLProcess):
        return new_df


+@register_tool(tool_type_name=TOOL_TYPE)
 class ExtractTimeComps(MLProcess):
    def __init__(self, time_col: str, time_comps: list):
        self.time_col = time_col
@ -201,6 +213,7 @@ class ExtractTimeComps(MLProcess):
        return new_df


+@register_tool(tool_type_name=TOOL_TYPE)
 class GeneralSelection(MLProcess):
    def __init__(self, label_col: str):
        self.label_col = label_col
@ -228,6 +241,7 @@ class GeneralSelection(MLProcess):
        return new_df


+# Not registered as a tool for now: this class requires lightgbm (lgb), whose import is commented out above.
 class TreeBasedSelection(MLProcess):
    def __init__(self, label_col: str, task_type: str):
        self.label_col = label_col
@ -270,6 +284,7 @@ class TreeBasedSelection(MLProcess):
        return new_df


+@register_tool(tool_type_name=TOOL_TYPE)
 class VarianceBasedSelection(MLProcess):
    def __init__(self, label_col: str, threshold: float = 0):
        self.label_col = label_col
--- a/metagpt/tools/functions/schemas/data_preprocess/FillMissingValue.yml
+++ b/metagpt/tools/functions/schemas/data_preprocess/FillMissingValue.yml
@ -0,0 +1,61 @@
+FillMissingValue:
+  type: class
+  description: "Completing missing values with simple strategies"
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          features:
+            type: list
+            description: "columns to be processed"
+          strategy:
+            type: str
+            description: "the imputation strategy, notice mean/median can only be used for numeric features"
+            default: mean
+            enum:
+              - mean
+              - median
+              - most_frequent
+              - constant
+          fill_value:
+            type: int
+            description: "fill_value is used to replace all occurrences of missing_values"
+            default: null
+        required:
+          - features
+    fit:
+      description: "Fit the FillMissingValue model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
--- a/metagpt/tools/functions/schemas/data_preprocess/LabelEncode.yml
+++ b/metagpt/tools/functions/schemas/data_preprocess/LabelEncode.yml
@ -0,0 +1,48 @@
+LabelEncode:
+  type: class
+  description: "Apply label encoding to specified categorical columns in-place."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          features:
+            type: list
+            description: "Categorical columns to be label encoded"
+        required:
+          - features
+    fit:
+      description: "Fit the LabelEncode model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
--- a/metagpt/tools/functions/schemas/data_preprocess/MaxAbsScale.yml
+++ b/metagpt/tools/functions/schemas/data_preprocess/MaxAbsScale.yml
@ -0,0 +1,48 @@
+MaxAbsScale:
+  type: class
+  description: "Scale each feature by its maximum absolute value"
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          features:
+            type: list
+            description: "columns to be processed"
+        required:
+          - features
+    fit:
+      description: "Fit the MaxAbsScale model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
--- a/metagpt/tools/functions/schemas/data_preprocess/MinMaxScale.yml
+++ b/metagpt/tools/functions/schemas/data_preprocess/MinMaxScale.yml
@ -0,0 +1,48 @@
+MinMaxScale:
+  type: class
+  description: "Transform features by scaling each feature to a range, which is (0, 1)"
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          features:
+            type: list
+            description: "columns to be processed"
+        required:
+          - features
+    fit:
+      description: "Fit the MinMaxScale model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
--- a/metagpt/tools/functions/schemas/data_preprocess/OneHotEncode.yml
+++ b/metagpt/tools/functions/schemas/data_preprocess/OneHotEncode.yml
@ -0,0 +1,48 @@
+OneHotEncode:
+  type: class
+  description: "Apply one-hot encoding to specified categorical columns, the original columns will be dropped."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          features:
+            type: list
+            description: "Categorical columns to be one-hot encoded and dropped"
+        required:
+          - features
+    fit:
+      description: "Fit the OneHotEncode model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
--- a/metagpt/tools/functions/schemas/data_preprocess/StandardScale.yml
+++ b/metagpt/tools/functions/schemas/data_preprocess/StandardScale.yml
@ -0,0 +1,48 @@
+StandardScale:
+  type: class
+  description: "Standardize features by removing the mean and scaling to unit variance"
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          features:
+            type: list
+            description: "columns to be processed"
+        required:
+          - features
+    fit:
+      description: "Fit the StandardScale model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
--- a/metagpt/tools/functions/schemas/feature_engineering/CatCount.yml
+++ b/metagpt/tools/functions/schemas/feature_engineering/CatCount.yml
@ -0,0 +1,48 @@
+CatCount:
+  type: class
+  description: "Add value counts of a categorical column as new feature."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          col:
+            type: str
+            description: "Column for value counts."
+        required:
+          - col
+    fit:
+      description: "Fit the CatCount model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
--- a/metagpt/tools/functions/schemas/feature_engineering/CatCross.yml
+++ b/metagpt/tools/functions/schemas/feature_engineering/CatCross.yml
@ -0,0 +1,52 @@
+CatCross:
+  type: class
+  description: "Add pairwise crossed features and convert them to numerical features."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          cols:
+            type: list
+            description: "Columns to be pairwise crossed, at least 2 columns."
+          max_cat_num:
+            type: int
+            description: "Maximum unique categories per crossed feature."
+            default: 100
+        required:
+          - cols
+    fit:
+      description: "Fit the CatCross model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
--- a/metagpt/tools/functions/schemas/feature_engineering/GeneralSelection.yml
+++ b/metagpt/tools/functions/schemas/feature_engineering/GeneralSelection.yml
@ -0,0 +1,48 @@
+GeneralSelection:
+  type: class
+  description: "Drop all nan feats and feats with only one unique value."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          label_col:
+            type: str
+            description: "Label column name."
+        required:
+          - label_col
+    fit:
+      description: "Fit the GeneralSelection model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
--- a/metagpt/tools/functions/schemas/feature_engineering/GroupStat.yml
+++ b/metagpt/tools/functions/schemas/feature_engineering/GroupStat.yml
@ -0,0 +1,58 @@
+GroupStat:
+  type: class
+  description: "Aggregate specified column in a DataFrame grouped by another column, adding new features named '<agg_col>_<agg_func>_by_<group_col>'."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          group_col:
+            type: str
+            description: "Column used for grouping."
+          agg_col:
+            type: str
+            description: "Column on which aggregation is performed."
+          agg_funcs:
+            type: list
+            description: >-
+              List of aggregation functions to apply, such as ['mean', 'std'].
+              Each function must be supported by pandas.
+        required:
+          - group_col
+          - agg_col
+          - agg_funcs
+    fit:
+      description: "Fit the GroupStat model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
--- a/metagpt/tools/functions/schemas/feature_engineering/KFoldTargetMeanEncoder.yml
+++ b/metagpt/tools/functions/schemas/feature_engineering/KFoldTargetMeanEncoder.yml
@ -0,0 +1,60 @@
+KFoldTargetMeanEncoder:
+  type: class
+  description: "Adds a new feature to the DataFrame by k-fold mean encoding of a categorical column using the label column."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          col:
+            type: str
+            description: "Column to be k-fold mean encoded."
+          label:
+            type: str
+            description: "Predicted label column."
+          n_splits:
+            type: int
+            description: "Number of splits for K-fold."
+            default: 5
+          random_state:
+            type: int
+            description: "Random seed."
+            default: 2021
+        required:
+          - col
+          - label
+    fit:
+      description: "Fit the KFoldTargetMeanEncoder model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
--- a/metagpt/tools/functions/schemas/feature_engineering/PolynomialExpansion.yml
+++ b/metagpt/tools/functions/schemas/feature_engineering/PolynomialExpansion.yml
@ -0,0 +1,548 @@
+PolynomialExpansion:
+  type: class
+  description: "Add polynomial and interaction features from selected numeric columns to input DataFrame."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          cols:
+            type: list
+            description: "Columns for polynomial expansion."
+          label_col:
+            type: str
+            description: "Label column name."
+          degree:
+            type: int
+            description: "The degree of the polynomial features."
+            default: 2
+        required:
+          - cols
+          - label_col
+    fit:
+      description: "Fit the PolynomialExpansion model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame without duplicated columns."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame without duplicated columns."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+
+CatCount:
+  type: class
+  description: "Add value counts of a categorical column as new feature."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          col:
+            type: str
+            description: "Column for value counts."
+        required:
+          - col
+    fit:
+      description: "Fit the CatCount model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+
+TargetMeanEncoder:
+  type: class
+  description: "Encodes a categorical column by the mean of the label column, and adds the result as a new feature."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          col:
+            type: str
+            description: "Column to be mean encoded."
+          label:
+            type: str
+            description: "Predicted label column."
+        required:
+          - col
+          - label
+    fit:
+      description: "Fit the TargetMeanEncoder model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+
+KFoldTargetMeanEncoder:
+  type: class
+  description: "Adds a new feature to the DataFrame by k-fold mean encoding of a categorical column using the label column."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          col:
+            type: str
+            description: "Column to be k-fold mean encoded."
+          label:
+            type: str
+            description: "Predicted label column."
+          n_splits:
+            type: int
+            description: "Number of splits for K-fold."
+            default: 5
+          random_state:
+            type: int
+            description: "Random seed."
+            default: 2021
+        required:
+          - col
+          - label
+    fit:
+      description: "Fit the KFoldTargetMeanEncoder model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+
+CatCross:
+  type: class
+  description: "Add pairwise crossed features and convert them to numerical features."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          cols:
+            type: list
+            description: "Columns to be pairwise crossed, at least 2 columns."
+          max_cat_num:
+            type: int
+            description: "Maximum unique categories per crossed feature."
+            default: 100
+        required:
+          - cols
+    fit:
+      description: "Fit the CatCross model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+
+GroupStat:
+  type: class
+  description: "Aggregate specified column in a DataFrame grouped by another column, adding new features named '<agg_col>_<agg_func>_by_<group_col>'."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          group_col:
+            type: str
+            description: "Column used for grouping."
+          agg_col:
+            type: str
+            description: "Column on which aggregation is performed."
+          agg_funcs:
+            type: list
+            description: >-
+              List of aggregation functions to apply, such as ['mean', 'std'].
+              Each function must be supported by pandas.
+        required:
+          - group_col
+          - agg_col
+          - agg_funcs
+    fit:
+      description: "Fit the GroupStat model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+
+SplitBins:
+  type: class
+  description: "Inplace binning of continuous data into intervals, returning integer-encoded bin identifiers directly."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          cols:
+            type: list
+            description: "Columns to be binned inplace."
+          strategy:
+            type: str
+            description: "Strategy used to define the widths of the bins."
+            default: quantile
+            enum:
+              - quantile
+              - uniform
+              - kmeans
+        required:
+          - cols
+    fit:
+      description: "Fit the SplitBins model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+
+GeneralSelection:
+  type: class
+  description: "Drop features whose values are all NaN and features with only one unique value."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          label_col:
+            type: str
+            description: "Label column name."
+        required:
+          - label_col
+    fit:
+      description: "Fit the GeneralSelection model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+
+
+TreeBasedSelection:
+  type: class
+  description: "Select features based on tree-based model and remove features with low importance."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          label_col:
+            type: str
+            description: "Label column name."
+          task_type:
+            type: str
+            description: "Task type, 'cls' for classification, 'mcls' for multi-class classification, 'reg' for regression."
+            enum:
+              - cls
+              - mcls
+              - reg
+        required:
+          - label_col
+          - task_type
+    fit:
+      description: "Fit the TreeBasedSelection model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame, which contains label_col."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame, which contains label_col."
+
+VarianceBasedSelection:
+  type: class
+  description: "Select features based on variance and remove features with low variance."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          label_col:
+            type: str
+            description: "Label column name."
+          threshold:
+            type: float
+            description: "Threshold for variance."
+            default: 0.0
+        required:
+          - label_col
+    fit:
+      description: "Fit the VarianceBasedSelection model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame, which contains label_col."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame, which contains label_col."
--- a/metagpt/tools/functions/schemas/feature_engineering/SplitBins.yml
+++ b/metagpt/tools/functions/schemas/feature_engineering/SplitBins.yml
@ -0,0 +1,56 @@
+SplitBins:
+  type: class
+  description: "Inplace binning of continuous data into intervals, returning integer-encoded bin identifiers directly."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          cols:
+            type: list
+            description: "Columns to be binned inplace."
+          strategy:
+            type: str
+            description: "Strategy used to define the widths of the bins."
+            default: quantile
+            enum:
+              - quantile
+              - uniform
+              - kmeans
+        required:
+          - cols
+    fit:
+      description: "Fit the SplitBins model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
--- a/metagpt/tools/functions/schemas/feature_engineering/TargetMeanEncoder.yml
+++ b/metagpt/tools/functions/schemas/feature_engineering/TargetMeanEncoder.yml
@ -0,0 +1,52 @@
+TargetMeanEncoder:
+  type: class
+  description: "Encodes a categorical column by the mean of the label column, and adds the result as a new feature."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          col:
+            type: str
+            description: "Column to be mean encoded."
+          label:
+            type: str
+            description: "Predicted label column."
+        required:
+          - col
+          - label
+    fit:
+      description: "Fit the TargetMeanEncoder model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
--- a/metagpt/tools/functions/schemas/feature_engineering/TreeBasedSelection.yml
+++ b/metagpt/tools/functions/schemas/feature_engineering/TreeBasedSelection.yml
@ -0,0 +1,56 @@
+TreeBasedSelection:
+  type: class
+  description: "Select features based on tree-based model and remove features with low importance."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          label_col:
+            type: str
+            description: "Label column name."
+          task_type:
+            type: str
+            description: "Task type, 'cls' for classification, 'mcls' for multi-class classification, 'reg' for regression."
+            enum:
+              - cls
+              - mcls
+              - reg
+        required:
+          - label_col
+          - task_type
+    fit:
+      description: "Fit the TreeBasedSelection model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame, which contains label_col."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame, which contains label_col."
--- a/metagpt/tools/functions/schemas/feature_engineering/VarianceBasedSelection.yml
+++ b/metagpt/tools/functions/schemas/feature_engineering/VarianceBasedSelection.yml
@ -0,0 +1,52 @@
+VarianceBasedSelection:
+  type: class
+  description: "Select features based on variance and remove features with low variance."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          label_col:
+            type: str
+            description: "Label column name."
+          threshold:
+            type: float
+            description: "Threshold for variance."
+            default: 0.0
+        required:
+          - label_col
+    fit:
+      description: "Fit the VarianceBasedSelection model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame, which contains label_col."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame, which contains label_col."
--- a/metagpt/tools/tool_registry.py
+++ b/metagpt/tools/tool_registry.py
@ -0,0 +1,128 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+@Time    : 2023/01/12 17:07
+@Author  : garylin2099
+@File    : tool_registry.py
+"""
+import os
+from collections import defaultdict
+import inspect
+import re
+
+import yaml
+
+from metagpt.tools.tool_schema import ToolType, ToolSchema, Tool
+from metagpt.logs import logger
+from metagpt.const import TOOL_SCHEMA_PATH
+
+
+class ToolRegistry:
+    """Registry that keeps tools and tool types in plain dictionaries.
+
+    Attributes:
+        tools: {tool_name: Tool} for every registered tool.
+        tool_types: {tool_type_name: ToolType} for every registered tool type.
+        tools_by_types: two-layer k-v, {tool_type_name: {tool_name: Tool, ...}, ...}.
+    """
+
+    def __init__(self):
+        self.tools = {}
+        self.tool_types = {}
+        self.tools_by_types = defaultdict(dict)
+
+    def register_tool_type(self, tool_type: ToolType):
+        """Store ``tool_type`` under ``tool_type.name``, replacing any earlier entry with that name."""
+        self.tool_types[tool_type.name] = tool_type
+
+    def register_tool(
+        self,
+        tool_name,
+        tool_path,
+        schema_path=None,
+        tool_code="",
+        tool_type_name="other",
+        make_schema_if_not_exists=False,
+    ):
+        """Load the schema of a tool and add the tool to the registry.
+
+        Nothing is registered when a tool called ``tool_name`` already exists, or when the
+        schema file is missing and ``make_schema_if_not_exists`` is false.
+
+        Args:
+            tool_name: Name of the tool; also the top-level key looked up in the schema file.
+            tool_path: Path of the code file that defines the tool.
+            schema_path: Path of the YAML schema file. Defaults to
+                ``TOOL_SCHEMA_PATH / tool_type_name / f"{tool_name}.yml"``.
+            tool_code: Source code of the tool.
+            tool_type_name: Name of the tool type the tool is filed under.
+            make_schema_if_not_exists: Create a placeholder schema file when none is found.
+        """
+        if self.has_tool(tool_name):
+            return
+
+        schema_path = schema_path or TOOL_SCHEMA_PATH / tool_type_name / f"{tool_name}.yml"
+
+        if not os.path.exists(schema_path):
+            if not make_schema_if_not_exists:
+                logger.warning(f"no schema found at assumed schema_path {schema_path}, skip registering {tool_name}")
+                return
+            logger.warning(f"no schema found, will make schema at {schema_path}")
+            make_schema(tool_code, schema_path)
+
+        with open(schema_path, "r", encoding="utf-8") as f:
+            loaded = yaml.safe_load(f)
+
+        # A freshly made placeholder file holds an empty mapping, and a hand-written file may lack
+        # this tool's entry; indexing with [tool_name] would raise KeyError in both cases.
+        schema = loaded.get(tool_name) if isinstance(loaded, dict) else None
+        if not isinstance(schema, dict):
+            logger.warning(f"no schema entry for {tool_name} in {schema_path}, registering with an empty schema")
+            schema = {}
+
+        schema["tool_path"] = tool_path  # corresponding code file path of the tool
+        try:
+            ToolSchema(**schema)  # validation
+        except Exception as e:
+            # The schema is used even when it does not validate, so the mismatch is only reported.
+            # Debug level keeps registration quiet while leaving the reason discoverable.
+            logger.debug(f"{tool_name} schema not conforms to required format, but will be used anyway. Mismatch: {e}")
+
+        tool = Tool(name=tool_name, path=tool_path, schema=schema, code=tool_code)
+        self.tools[tool_name] = tool
+        self.tools_by_types[tool_type_name][tool_name] = tool
+        logger.info(f"{tool_name} registered")
+
+    def has_tool(self, key):
+        """Return True if a tool named ``key`` has been registered."""
+        return key in self.tools
+
+    def get_tool(self, key):
+        """Return the Tool registered as ``key``, or None when there is none."""
+        return self.tools.get(key)
+
+    def get_tools_by_type(self, key):
+        """Return the {tool_name: Tool} dict of tool type ``key``, or None when no tool of that type is registered."""
+        return self.tools_by_types.get(key)
+
+    def has_tool_type(self, key):
+        """Return True if a tool type named ``key`` has been registered."""
+        return key in self.tool_types
+
+    def get_tool_type(self, key):
+        """Return the ToolType registered as ``key``, or None when there is none."""
+        return self.tool_types.get(key)
+
+    def get_tool_types(self):
+        """Return the {tool_type_name: ToolType} dict itself (not a copy)."""
+        return self.tool_types
+
+
+# Module-level registry instance used by the register_tool_type / register_tool decorators in this module
+TOOL_REGISTRY = ToolRegistry()
+
+
+def register_tool_type(cls):
+    """Class decorator that adds a tool type to TOOL_REGISTRY.
+
+    ``cls`` is instantiated without arguments and the instance is registered;
+    the class itself is returned untouched.
+    """
+    tool_type_instance = cls()
+    TOOL_REGISTRY.register_tool_type(tool_type=tool_type_instance)
+    return cls
+
+
+def register_tool(tool_name="", tool_type_name="other", schema_path=None):
+    """Build a decorator that adds the decorated function / class to TOOL_REGISTRY.
+
+    Args:
+        tool_name: Name to register under; when empty, the decorated object's ``__name__`` is used.
+        tool_type_name: Name of the tool type the tool is filed under.
+        schema_path: Explicit schema file path, forwarded unchanged to the registry.
+
+    Returns:
+        A decorator that registers its argument and returns it unchanged.
+    """
+
+    def decorator(cls, tool_name=tool_name):
+        # tool_name is bound as a default argument so that it can be reassigned locally
+        if not tool_name:
+            tool_name = cls.__name__
+
+        # Locate the defining file; paths containing "metagpt" are cut down to start at that segment
+        file_path = inspect.getfile(cls)
+        if "metagpt" in file_path:
+            matched = re.search("metagpt.+", file_path)
+            file_path = matched.group(0)
+
+        # Source text of the decorated function / class
+        source_code = inspect.getsource(cls)
+
+        registration = dict(
+            tool_name=tool_name,
+            tool_path=file_path,
+            schema_path=schema_path,
+            tool_code=source_code,
+            tool_type_name=tool_type_name,
+        )
+        TOOL_REGISTRY.register_tool(**registration)
+        return cls
+
+    return decorator
+
+
+def make_schema(tool_code, path):
+    """Write a placeholder YAML schema file at ``path`` and return ``path``.
+
+    The file currently holds an empty mapping; ``tool_code`` is accepted but not used yet.
+
+    Args:
+        tool_code: Source code of the tool the schema is meant for (unused for now).
+        path: Destination of the schema file; missing parent directories are created.
+
+    Returns:
+        The ``path`` that was written.
+    """
+    parent_dir = os.path.dirname(path)
+    # A bare file name has an empty dirname, and os.makedirs("") raises FileNotFoundError
+    if parent_dir:
+        os.makedirs(parent_dir, exist_ok=True)  # Create the necessary directories
+    schema = {}  # an empty schema for now
+    with open(path, "w", encoding="utf-8") as f:
+        yaml.dump(schema, f)
+    return path
--- a/metagpt/tools/tool_schema.py
+++ b/metagpt/tools/tool_schema.py
@ -0,0 +1,31 @@
+from enum import Enum
+
+from pydantic import BaseModel
+
+
+class ToolTypeEnum(Enum):
+    """Names of the predefined tool types.
+
+    A by-value lookup that matches no member, e.g. ``ToolTypeEnum("unknown")``,
+    yields ``ToolTypeEnum.OTHER`` instead of raising ``ValueError``.
+    """
+
+    DATA_PREPROCESS = "data_preprocess"
+    FEATURE_ENGINEERING = "feature_engineering"
+    MODEL_TRAIN = "model_train"
+    MODEL_EVALUATE = "model_evaluate"
+    OTHER = "other"
+
+    @classmethod
+    def _missing_(cls, value):
+        # Enum calls the ``_missing_`` classmethod when a by-value lookup fails;
+        # ``__missing__`` is a dict hook and is never consulted for enums.
+        return cls.OTHER
+
+
+class ToolType(BaseModel):
+    """Description of one category of tools."""
+
+    name: str  # identifier of the tool type; ToolRegistry keys tool types by this value
+    desc: str  # short description of the tool type
+    usage_prompt: str = ""  # prompt text attached to the tool type; empty by default
+
+
+class ToolSchema(BaseModel):
+    """Model that ToolRegistry.register_tool instantiates to validate a loaded schema dict."""
+
+    # NOTE(review): the YAML schema entries define type / description / methods but no ``name`` key,
+    # so validation presumably fails for them — confirm the intended schema format.
+    name: str
+
+
+class Tool(BaseModel):
+    """A registered tool as stored by ToolRegistry."""
+
+    name: str  # tool name, used as the registry key
+    path: str  # path of the code file that defines the tool
+    # NOTE(review): ``schema`` is also the name of a BaseModel attribute; depending on the pydantic
+    # version this field may trigger a shadowing warning or error — confirm.
+    schema: dict = {}  # schema dict loaded from the tool's YAML file
+    code: str = ""  # source code of the tool
--- a/metagpt/tools/tool_types.py
+++ b/metagpt/tools/tool_types.py
@ -0,0 +1,43 @@
+from metagpt.prompts.tool_type import (
+    DATA_PREPROCESS_PROMPT,
+    FEATURE_ENGINEERING_PROMPT,
+    MODEL_TRAIN_PROMPT,
+    MODEL_EVALUATE_PROMPT,
+)
+from metagpt.tools.tool_schema import ToolTypeEnum, ToolType
+from metagpt.tools.tool_registry import register_tool_type
+
+
+@register_tool_type
+class DataPreprocess(ToolType):
+    """Tool type for data preprocessing; the decorator registers an instance when this class is defined."""
+
+    name: str = ToolTypeEnum.DATA_PREPROCESS.value
+    desc: str = "Only for changing value inplace."
+    usage_prompt: str = DATA_PREPROCESS_PROMPT
+
+
+@register_tool_type
+class FeatureEngineer(ToolType):
+    """Tool type for feature engineering; the decorator registers an instance when this class is defined."""
+
+    name: str = ToolTypeEnum.FEATURE_ENGINEERING.value
+    desc: str = "Only for creating new columns for input data."
+    usage_prompt: str = FEATURE_ENGINEERING_PROMPT
+
+
+@register_tool_type
+class ModelTrain(ToolType):
+    """Tool type for model training; the decorator registers an instance when this class is defined."""
+
+    name: str = ToolTypeEnum.MODEL_TRAIN.value
+    desc: str = "Only for training model."
+    usage_prompt: str = MODEL_TRAIN_PROMPT
+
+
+@register_tool_type
+class ModelEvaluate(ToolType):
+    """Tool type for model evaluation; the decorator registers an instance when this class is defined."""
+
+    name: str = ToolTypeEnum.MODEL_EVALUATE.value
+    desc: str = "Only for evaluating model."
+    usage_prompt: str = MODEL_EVALUATE_PROMPT
+
+
+@register_tool_type
+class Other(ToolType):
+    """Fallback tool type with no usage prompt; the decorator registers an instance when this class is defined."""
+
+    name: str = ToolTypeEnum.OTHER.value
+    desc: str = "Any tools not in the defined categories"
+    usage_prompt: str = ""