From 46cd219e817eae2abf6d5a8b552bebf531672526 Mon Sep 17 00:00:00 2001
From: yzlin <yzlin@fuzhi.ai>
Date: Sat, 13 Jan 2024 01:28:49 +0800
Subject: [PATCH] add tool registry

---
 metagpt/actions/write_analysis_code.py        |  58 +-
 metagpt/actions/write_plan.py                 |   7 +-
 metagpt/prompts/ml_engineer.py                |   2 +-
 metagpt/tools/__init__.py                     |  11 -
 .../tools/functions/libs/data_preprocess.py   |  13 +
 .../functions/libs/feature_engineering.py     |  17 +-
 .../data_preprocess/FillMissingValue.yml      |  61 ++
 .../schemas/data_preprocess/LabelEncode.yml   |  48 ++
 .../schemas/data_preprocess/MaxAbsScale.yml   |  48 ++
 .../schemas/data_preprocess/MinMaxScale.yml   |  48 ++
 .../schemas/data_preprocess/OneHotEncode.yml  |  48 ++
 .../schemas/data_preprocess/StandardScale.yml |  48 ++
 .../schemas/feature_engineering/CatCount.yml  |  48 ++
 .../schemas/feature_engineering/CatCross.yml  |  52 ++
 .../feature_engineering/GeneralSelection.yml  |  48 ++
 .../schemas/feature_engineering/GroupStat.yml |  58 ++
 .../KFoldTargetMeanEncoder.yml                |  60 ++
 .../PolynomialExpansion.yml                   | 548 ++++++++++++++++++
 .../schemas/feature_engineering/SplitBins.yml |  56 ++
 .../feature_engineering/TargetMeanEncoder.yml |  52 ++
 .../TreeBasedSelection.yml                    |  56 ++
 .../VarianceBasedSelection.yml                |  52 ++
 metagpt/tools/tool_registry.py                | 128 ++++
 metagpt/tools/tool_schema.py                  |  31 +
 metagpt/tools/tool_types.py                   |  43 ++
 25 files changed, 1582 insertions(+), 59 deletions(-)
 create mode 100644 metagpt/tools/functions/schemas/data_preprocess/FillMissingValue.yml
 create mode 100644 metagpt/tools/functions/schemas/data_preprocess/LabelEncode.yml
 create mode 100644 metagpt/tools/functions/schemas/data_preprocess/MaxAbsScale.yml
 create mode 100644 metagpt/tools/functions/schemas/data_preprocess/MinMaxScale.yml
 create mode 100644 metagpt/tools/functions/schemas/data_preprocess/OneHotEncode.yml
 create mode 100644 metagpt/tools/functions/schemas/data_preprocess/StandardScale.yml
 create mode 100644 metagpt/tools/functions/schemas/feature_engineering/CatCount.yml
 create mode 100644 metagpt/tools/functions/schemas/feature_engineering/CatCross.yml
 create mode 100644 metagpt/tools/functions/schemas/feature_engineering/GeneralSelection.yml
 create mode 100644 metagpt/tools/functions/schemas/feature_engineering/GroupStat.yml
 create mode 100644 metagpt/tools/functions/schemas/feature_engineering/KFoldTargetMeanEncoder.yml
 create mode 100644 metagpt/tools/functions/schemas/feature_engineering/PolynomialExpansion.yml
 create mode 100644 metagpt/tools/functions/schemas/feature_engineering/SplitBins.yml
 create mode 100644 metagpt/tools/functions/schemas/feature_engineering/TargetMeanEncoder.yml
 create mode 100644 metagpt/tools/functions/schemas/feature_engineering/TreeBasedSelection.yml
 create mode 100644 metagpt/tools/functions/schemas/feature_engineering/VarianceBasedSelection.yml
 create mode 100644 metagpt/tools/tool_registry.py
 create mode 100644 metagpt/tools/tool_schema.py
 create mode 100644 metagpt/tools/tool_types.py

diff --git a/metagpt/actions/write_analysis_code.py b/metagpt/actions/write_analysis_code.py
index 9104fdf82..f4ae1e572 100644
--- a/metagpt/actions/write_analysis_code.py
+++ b/metagpt/actions/write_analysis_code.py
@@ -8,11 +8,9 @@ import re
 from pathlib import Path
 from typing import Dict, List, Tuple, Union
 
-import yaml
 from tenacity import retry, stop_after_attempt, wait_fixed
 
 from metagpt.actions import Action
-from metagpt.const import TOOL_SCHEMA_PATH
 from metagpt.llm import LLM
 from metagpt.logs import logger
 from metagpt.prompts.ml_engineer import (
@@ -24,12 +22,9 @@ from metagpt.prompts.ml_engineer import (
     TOOL_USAGE_PROMPT,
 )
 from metagpt.schema import Message, Plan
-from metagpt.tools import TOOL_TYPE_MAPPINGS
+from metagpt.tools.tool_registry import TOOL_REGISTRY
 from metagpt.utils.common import create_func_config, remove_comments
 
-TOOL_TYPE_MODULE = {k: v.module for k, v in TOOL_TYPE_MAPPINGS.items()}
-TOOL_TYPE_USAGE_PROMPT = {k: v.usage_prompt for k, v in TOOL_TYPE_MAPPINGS.items()}
-
 
 class BaseWriteAnalysisCode(Action):
     DEFAULT_SYSTEM_MSG: str = """You are Code Interpreter, a world-class programmer that can complete any goal by executing code. Strictly follow the plan and generate code step by step. Each step of the code will be executed on the user's machine, and the user will provide the code execution results to you.**Notice: The code for the next step depends on the code for the previous step. Must reuse variables in the lastest other code directly, dont creat it again, it is very import for you. Use !pip install in a standalone block to install missing packages.Usually the libraries you need are already installed.Dont check if packages already imported.**"""  # prompt reference: https://github.com/KillianLucas/open-interpreter/blob/v0.1.4/interpreter/system_message.txt
@@ -95,49 +90,27 @@ class WriteCodeByGenerate(BaseWriteAnalysisCode):
 class WriteCodeWithTools(BaseWriteAnalysisCode):
     """Write code with help of local available tools. Choose tools first, then generate code to use the tools"""
 
-    schema_path: Union[Path, str] = TOOL_SCHEMA_PATH
     available_tools: dict = {}
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
-        self._load_tools(self.schema_path)
 
-    def _load_tools(self, schema_path, schema_module=None):
-        """Load tools from yaml file"""
-        if isinstance(schema_path, dict):
-            schema_module = schema_module or "udf"
-            self.available_tools.update({schema_module: schema_path})
-        else:
-            if isinstance(schema_path, list):
-                yml_files = schema_path
-            elif isinstance(schema_path, Path) and schema_path.is_file():
-                yml_files = [schema_path]
-            else:
-                yml_files = schema_path.glob("*.yml")
-
-            for yml_file in yml_files:
-                module = yml_file.stem
-                with open(yml_file, "r", encoding="utf-8") as f:
-                    self.available_tools[module] = yaml.safe_load(f)
-
-    def _parse_recommend_tools(self, module: str, recommend_tools: list) -> dict:
+    def _parse_recommend_tools(self, recommend_tools: list) -> dict:
         """
         Parses and validates a list of recommended tools, and retrieves their schema from registry.
 
         Args:
-            module (str): The module name for querying tools in the registry.
             recommend_tools (list): A list of recommended tools.
 
         Returns:
             dict: A dict of valid tool schemas.
         """
         valid_tools = []
-        available_tools = self.available_tools[module].keys()
-        for tool in recommend_tools:
-            if tool in available_tools:
-                valid_tools.append(tool)
+        for tool_name in recommend_tools:
+            if TOOL_REGISTRY.has_tool(tool_name):
+                valid_tools.append(TOOL_REGISTRY.get_tool(tool_name))
 
-        tool_catalog = {tool: self.available_tools[module][tool] for tool in valid_tools}
+        tool_catalog = {tool.name: tool.schema for tool in valid_tools}
         return tool_catalog
 
     async def _tool_recommendation(
@@ -176,8 +149,10 @@ class WriteCodeWithTools(BaseWriteAnalysisCode):
         tool_type = (
             plan.current_task.task_type
         )  # find tool type from task type through exact match, can extend to retrieval in the future
-        available_tools = self.available_tools.get(tool_type, {})
-        special_prompt = TOOL_TYPE_USAGE_PROMPT.get(tool_type, "")
+        available_tools = TOOL_REGISTRY.get_tools_by_type(tool_type)
+        special_prompt = (
+            TOOL_REGISTRY.get_tool_type(tool_type).usage_prompt if TOOL_REGISTRY.has_tool_type(tool_type) else ""
+        )
         code_steps = plan.current_task.code_steps
 
         finished_tasks = plan.get_finished_tasks()
@@ -185,22 +160,17 @@ class WriteCodeWithTools(BaseWriteAnalysisCode):
         code_context = "\n\n".join(code_context)
 
         tool_catalog = {}
-        module_name = ""
 
-        if len(available_tools) > 0:
-            available_tools = {k: v["description"] for k, v in available_tools.items()}
+        if available_tools:
+            available_tools = {tool_name: tool.schema["description"] for tool_name, tool in available_tools.items()}
 
             recommend_tools = await self._tool_recommendation(
                 plan.current_task.instruction, code_steps, available_tools
             )
-            tool_catalog = self._parse_recommend_tools(tool_type, recommend_tools)
+            tool_catalog = self._parse_recommend_tools(recommend_tools)
             logger.info(f"Recommended tools: \n{recommend_tools}")
 
-            module_name = TOOL_TYPE_MODULE[tool_type]
-
-        tools_instruction = TOOL_USAGE_PROMPT.format(
-            special_prompt=special_prompt, module_name=module_name, tool_catalog=tool_catalog
-        )
+        tools_instruction = TOOL_USAGE_PROMPT.format(special_prompt=special_prompt, tool_catalog=tool_catalog)
 
         context.append(Message(content=tools_instruction, role="user"))
 
diff --git a/metagpt/actions/write_plan.py b/metagpt/actions/write_plan.py
index c7ef541b9..60dcef43b 100644
--- a/metagpt/actions/write_plan.py
+++ b/metagpt/actions/write_plan.py
@@ -12,7 +12,7 @@ from metagpt.actions import Action
 from metagpt.logs import logger
 from metagpt.prompts.ml_engineer import ASSIGN_TASK_TYPE_CONFIG, ASSIGN_TASK_TYPE_PROMPT
 from metagpt.schema import Message, Plan, Task
-from metagpt.tools import TOOL_TYPE_MAPPINGS
+from metagpt.tools import TOOL_REGISTRY
 from metagpt.utils.common import CodeParser, create_func_config
 
 
@@ -47,13 +47,16 @@ class WritePlan(Action):
             List[Dict]: tasks with task type assigned
         """
         task_list = "\n".join([f"Task {task['task_id']}: {task['instruction']}" for task in tasks])
-        task_type_desc = "\n".join([f"- **{item.name}**: {item.desc}" for item in TOOL_TYPE_MAPPINGS.values()])
+        task_type_desc = "\n".join(
+            [f"- **{tool_type.name}**: {tool_type.desc}" for tool_type in TOOL_REGISTRY.get_tool_types().values()]
+        )  # task types are bound to tool types for now; should be improved in the future
         prompt = ASSIGN_TASK_TYPE_PROMPT.format(
             task_list=task_list, task_type_desc=task_type_desc
         )  # task types are set to be the same as tool types, for now
         tool_config = create_func_config(ASSIGN_TASK_TYPE_CONFIG)
         rsp = await self.llm.aask_code(prompt, **tool_config)
         task_type_list = rsp["task_type"]
+        print(f"assigned task types: {task_type_list}")
         for task, task_type in zip(tasks, task_type_list):
             task["task_type"] = task_type
         return json.dumps(tasks)
diff --git a/metagpt/prompts/ml_engineer.py b/metagpt/prompts/ml_engineer.py
index 3baf79843..31d754a9e 100644
--- a/metagpt/prompts/ml_engineer.py
+++ b/metagpt/prompts/ml_engineer.py
@@ -203,7 +203,7 @@ Specifically, {special_prompt}
 - You can freely combine the use of any other public packages, like sklearn, numpy, pandas, etc..
 
 # Available Tools (can be empty):
-Each Class tool is described in JSON format. When you call a tool, import the tool from `{module_name}` first.
+Each Class tool is described in JSON format. When you call a tool, import the tool first.
 {tool_catalog}
 
 # Constraints:
diff --git a/metagpt/tools/__init__.py b/metagpt/tools/__init__.py
index 222edf312..f743d63c7 100644
--- a/metagpt/tools/__init__.py
+++ b/metagpt/tools/__init__.py
@@ -8,17 +8,6 @@
 
 from enum import Enum
 
-from pydantic import BaseModel
-
-from metagpt.const import TOOL_LIBS_PATH
-from metagpt.prompts.tool_type import (
-    DATA_PREPROCESS_PROMPT,
-    FEATURE_ENGINEERING_PROMPT,
-    MODEL_TRAIN_PROMPT,
-    MODEL_EVALUATE_PROMPT,
-    VISION_PROMPT,
-)
-
 
 class SearchEngineType(Enum):
     SERPAPI_GOOGLE = "serpapi"
diff --git a/metagpt/tools/functions/libs/data_preprocess.py b/metagpt/tools/functions/libs/data_preprocess.py
index f423f2020..59ede3ffc 100644
--- a/metagpt/tools/functions/libs/data_preprocess.py
+++ b/metagpt/tools/functions/libs/data_preprocess.py
@@ -14,8 +14,13 @@ from sklearn.preprocessing import (
 )
 
 from metagpt.tools.functions.libs.base import MLProcess
+from metagpt.tools.tool_registry import register_tool
+from metagpt.tools.tool_schema import ToolTypeEnum
+
+TOOL_TYPE = ToolTypeEnum.DATA_PREPROCESS.value
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 class FillMissingValue(MLProcess):
     def __init__(
         self,
@@ -42,6 +47,7 @@ class FillMissingValue(MLProcess):
         return new_df
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 class MinMaxScale(MLProcess):
     def __init__(
         self,
@@ -60,6 +66,7 @@ class MinMaxScale(MLProcess):
         return new_df
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 class StandardScale(MLProcess):
     def __init__(
         self,
@@ -78,6 +85,7 @@ class StandardScale(MLProcess):
         return new_df
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 class MaxAbsScale(MLProcess):
     def __init__(
         self,
@@ -96,6 +104,7 @@ class MaxAbsScale(MLProcess):
         return new_df
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 class RobustScale(MLProcess):
     def __init__(
         self,
@@ -114,6 +123,7 @@ class RobustScale(MLProcess):
         return new_df
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 class OrdinalEncode(MLProcess):
     def __init__(
         self,
@@ -132,6 +142,7 @@ class OrdinalEncode(MLProcess):
         return new_df
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 class OneHotEncode(MLProcess):
     def __init__(
         self,
@@ -153,6 +164,7 @@ class OneHotEncode(MLProcess):
         return new_df
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 class LabelEncode(MLProcess):
     def __init__(
         self,
@@ -181,6 +193,7 @@ class LabelEncode(MLProcess):
         return new_df
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 def get_column_info(df: pd.DataFrame) -> dict:
     column_info = {
         "Category": [],
diff --git a/metagpt/tools/functions/libs/feature_engineering.py b/metagpt/tools/functions/libs/feature_engineering.py
index 0d9584b4a..8b96cbd07 100644
--- a/metagpt/tools/functions/libs/feature_engineering.py
+++ b/metagpt/tools/functions/libs/feature_engineering.py
@@ -6,7 +6,7 @@
 # @Desc    : Feature Engineering Tools
 import itertools
 
-import lightgbm as lgb
+# import lightgbm as lgb
 import numpy as np
 import pandas as pd
 from joblib import Parallel, delayed
@@ -16,8 +16,13 @@ from sklearn.model_selection import KFold
 from sklearn.preprocessing import KBinsDiscretizer, PolynomialFeatures
 
 from metagpt.tools.functions.libs.base import MLProcess
+from metagpt.tools.tool_registry import register_tool
+from metagpt.tools.tool_schema import ToolTypeEnum
+
+TOOL_TYPE = ToolTypeEnum.FEATURE_ENGINEERING.value
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 class PolynomialExpansion(MLProcess):
     def __init__(self, cols: list, degree: int = 2, label_col: str = None):
         self.cols = cols
@@ -48,6 +53,7 @@ class PolynomialExpansion(MLProcess):
         return new_df
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 class CatCount(MLProcess):
     def __init__(self, col: str):
         self.col = col
@@ -62,6 +68,7 @@ class CatCount(MLProcess):
         return new_df
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 class TargetMeanEncoder(MLProcess):
     def __init__(self, col: str, label: str):
         self.col = col
@@ -77,6 +84,7 @@ class TargetMeanEncoder(MLProcess):
         return new_df
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 class KFoldTargetMeanEncoder(MLProcess):
     def __init__(self, col: str, label: str, n_splits: int = 5, random_state: int = 2021):
         self.col = col
@@ -103,6 +111,7 @@ class KFoldTargetMeanEncoder(MLProcess):
         return new_df
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 class CatCross(MLProcess):
     def __init__(self, cols: list, max_cat_num: int = 100):
         self.cols = cols
@@ -138,6 +147,7 @@ class CatCross(MLProcess):
         return new_df
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 class GroupStat(MLProcess):
     def __init__(self, group_col: str, agg_col: str, agg_funcs: list):
         self.group_col = group_col
@@ -157,6 +167,7 @@ class GroupStat(MLProcess):
         return new_df
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 class SplitBins(MLProcess):
     def __init__(self, cols: list, strategy: str = "quantile"):
         self.cols = cols
@@ -173,6 +184,7 @@ class SplitBins(MLProcess):
         return new_df
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 class ExtractTimeComps(MLProcess):
     def __init__(self, time_col: str, time_comps: list):
         self.time_col = time_col
@@ -201,6 +213,7 @@ class ExtractTimeComps(MLProcess):
         return new_df
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 class GeneralSelection(MLProcess):
     def __init__(self, label_col: str):
         self.label_col = label_col
@@ -228,6 +241,7 @@ class GeneralSelection(MLProcess):
         return new_df
 
 
+# skip for now because lgb is needed
 class TreeBasedSelection(MLProcess):
     def __init__(self, label_col: str, task_type: str):
         self.label_col = label_col
@@ -270,6 +284,7 @@ class TreeBasedSelection(MLProcess):
         return new_df
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 class VarianceBasedSelection(MLProcess):
     def __init__(self, label_col: str, threshold: float = 0):
         self.label_col = label_col
diff --git a/metagpt/tools/functions/schemas/data_preprocess/FillMissingValue.yml b/metagpt/tools/functions/schemas/data_preprocess/FillMissingValue.yml
new file mode 100644
index 000000000..44c830a1e
--- /dev/null
+++ b/metagpt/tools/functions/schemas/data_preprocess/FillMissingValue.yml
@@ -0,0 +1,61 @@
+FillMissingValue:
+  type: class
+  description: "Completing missing values with simple strategies"
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          features:
+            type: list
+            description: "columns to be processed"
+          strategy:
+            type: str
+            description: "the imputation strategy, notice mean/median can only be used for numeric features"
+            default: mean
+            enum:
+              - mean
+              - median
+              - most_frequent
+              - constant
+          fill_value:
+            type: int
+            description: "fill_value is used to replace all occurrences of missing_values"
+            default: null
+        required:
+          - features
+    fit:
+      description: "Fit the FillMissingValue model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
diff --git a/metagpt/tools/functions/schemas/data_preprocess/LabelEncode.yml b/metagpt/tools/functions/schemas/data_preprocess/LabelEncode.yml
new file mode 100644
index 000000000..419ef60a8
--- /dev/null
+++ b/metagpt/tools/functions/schemas/data_preprocess/LabelEncode.yml
@@ -0,0 +1,48 @@
+LabelEncode:
+  type: class
+  description: "Apply label encoding to specified categorical columns in-place."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          features:
+            type: list
+            description: "Categorical columns to be label encoded"
+        required:
+          - features
+    fit:
+      description: "Fit the LabelEncode model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
diff --git a/metagpt/tools/functions/schemas/data_preprocess/MaxAbsScale.yml b/metagpt/tools/functions/schemas/data_preprocess/MaxAbsScale.yml
new file mode 100644
index 000000000..3e17cfdd0
--- /dev/null
+++ b/metagpt/tools/functions/schemas/data_preprocess/MaxAbsScale.yml
@@ -0,0 +1,48 @@
+MaxAbsScale:
+  type: class
+  description: "Scale each feature by its maximum absolute value"
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          features:
+            type: list
+            description: "columns to be processed"
+        required:
+          - features
+    fit:
+      description: "Fit the MaxAbsScale model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
\ No newline at end of file
diff --git a/metagpt/tools/functions/schemas/data_preprocess/MinMaxScale.yml b/metagpt/tools/functions/schemas/data_preprocess/MinMaxScale.yml
new file mode 100644
index 000000000..8f050d942
--- /dev/null
+++ b/metagpt/tools/functions/schemas/data_preprocess/MinMaxScale.yml
@@ -0,0 +1,48 @@
+MinMaxScale:
+  type: class
+  description: "Transform features by scaling each feature to a range, which is (0, 1)"
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          features:
+            type: list
+            description: "columns to be processed"
+        required:
+          - features
+    fit:
+      description: "Fit the MinMaxScale model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
diff --git a/metagpt/tools/functions/schemas/data_preprocess/OneHotEncode.yml b/metagpt/tools/functions/schemas/data_preprocess/OneHotEncode.yml
new file mode 100644
index 000000000..f499b2cb8
--- /dev/null
+++ b/metagpt/tools/functions/schemas/data_preprocess/OneHotEncode.yml
@@ -0,0 +1,48 @@
+OneHotEncode:
+  type: class
+  description: "Apply one-hot encoding to specified categorical columns, the original columns will be dropped."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          features:
+            type: list
+            description: "Categorical columns to be one-hot encoded and dropped"
+        required:
+          - features
+    fit:
+      description: "Fit the OneHotEncoding model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
diff --git a/metagpt/tools/functions/schemas/data_preprocess/StandardScale.yml b/metagpt/tools/functions/schemas/data_preprocess/StandardScale.yml
new file mode 100644
index 000000000..cf6e7d57b
--- /dev/null
+++ b/metagpt/tools/functions/schemas/data_preprocess/StandardScale.yml
@@ -0,0 +1,48 @@
+StandardScale:
+  type: class
+  description: "Standardize features by removing the mean and scaling to unit variance"
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          features:
+            type: list
+            description: "columns to be processed"
+        required:
+          - features
+    fit:
+      description: "Fit the StandardScale model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
diff --git a/metagpt/tools/functions/schemas/feature_engineering/CatCount.yml b/metagpt/tools/functions/schemas/feature_engineering/CatCount.yml
new file mode 100644
index 000000000..049fc7879
--- /dev/null
+++ b/metagpt/tools/functions/schemas/feature_engineering/CatCount.yml
@@ -0,0 +1,48 @@
+CatCount:
+  type: class
+  description: "Add value counts of a categorical column as new feature."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          col:
+            type: str
+            description: "Column for value counts."
+        required:
+          - col
+    fit:
+      description: "Fit the CatCount model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
\ No newline at end of file
diff --git a/metagpt/tools/functions/schemas/feature_engineering/CatCross.yml b/metagpt/tools/functions/schemas/feature_engineering/CatCross.yml
new file mode 100644
index 000000000..5d6303439
--- /dev/null
+++ b/metagpt/tools/functions/schemas/feature_engineering/CatCross.yml
@@ -0,0 +1,52 @@
+CatCross:
+  type: class
+  description: "Add pairwise crossed features and convert them to numerical features."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          cols:
+            type: list
+            description: "Columns to be pairwise crossed, at least 2 columns."
+          max_cat_num:
+            type: int
+            description: "Maximum unique categories per crossed feature."
+            default: 100
+        required:
+          - cols
+    fit:
+      description: "Fit the CatCross model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
\ No newline at end of file
diff --git a/metagpt/tools/functions/schemas/feature_engineering/GeneralSelection.yml b/metagpt/tools/functions/schemas/feature_engineering/GeneralSelection.yml
new file mode 100644
index 000000000..2ebf5b397
--- /dev/null
+++ b/metagpt/tools/functions/schemas/feature_engineering/GeneralSelection.yml
@@ -0,0 +1,48 @@
+GeneralSelection:
+  type: class
+  description: "Drop all nan feats and feats with only one unique value."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          label_col:
+            type: str
+            description: "Label column name."
+        required:
+          - label_col
+    fit:
+      description: "Fit the GeneralSelection model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
\ No newline at end of file
diff --git a/metagpt/tools/functions/schemas/feature_engineering/GroupStat.yml b/metagpt/tools/functions/schemas/feature_engineering/GroupStat.yml
new file mode 100644
index 000000000..6e0ba2877
--- /dev/null
+++ b/metagpt/tools/functions/schemas/feature_engineering/GroupStat.yml
@@ -0,0 +1,58 @@
+GroupStat:
+  type: class
+  description: "Aggregate specified column in a DataFrame grouped by another column, adding new features named '<agg_col>_<agg_func>_by_<group_col>'."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          group_col:
+            type: str
+            description: "Column used for grouping."
+          agg_col:
+            type: str
+            description: "Column on which aggregation is performed."
+          agg_funcs:
+            type: list
+            description: >-
+              List of aggregation functions to apply, such as ['mean', 'std'].
+              Each function must be supported by pandas.
+        required:
+          - group_col
+          - agg_col
+          - agg_funcs
+    fit:
+      description: "Fit the GroupStat model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
\ No newline at end of file
diff --git a/metagpt/tools/functions/schemas/feature_engineering/KFoldTargetMeanEncoder.yml b/metagpt/tools/functions/schemas/feature_engineering/KFoldTargetMeanEncoder.yml
new file mode 100644
index 000000000..79a673f9f
--- /dev/null
+++ b/metagpt/tools/functions/schemas/feature_engineering/KFoldTargetMeanEncoder.yml
@@ -0,0 +1,60 @@
+KFoldTargetMeanEncoder:
+  type: class
+  description: "Adds a new feature to the DataFrame by k-fold mean encoding of a categorical column using the label column."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          col:
+            type: str
+            description: "Column to be k-fold mean encoded."
+          label:
+            type: str
+            description: "Predicted label column."
+          n_splits:
+            type: int
+            description: "Number of splits for K-fold."
+            default: 5
+          random_state:
+            type: int
+            description: "Random seed."
+            default: 2021
+        required:
+          - col
+          - label
+    fit:
+      description: "Fit the KFoldTargetMeanEncoder model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
\ No newline at end of file
diff --git a/metagpt/tools/functions/schemas/feature_engineering/PolynomialExpansion.yml b/metagpt/tools/functions/schemas/feature_engineering/PolynomialExpansion.yml
new file mode 100644
index 000000000..62e6ad5b3
--- /dev/null
+++ b/metagpt/tools/functions/schemas/feature_engineering/PolynomialExpansion.yml
@@ -0,0 +1,548 @@
+PolynomialExpansion:
+  type: class
+  description: "Add polynomial and interaction features from selected numeric columns to input DataFrame."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          cols:
+            type: list
+            description: "Columns for polynomial expansion."
+          label_col:
+            type: str
+            description: "Label column name."
+          degree:
+            type: int
+            description: "The degree of the polynomial features."
+            default: 2
+        required:
+          - cols
+          - label_col
+    fit:
+      description: "Fit the PolynomialExpansion model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame without duplicated columns."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame without duplicated columns."
+
+CatCount:
+  type: class
+  description: "Add value counts of a categorical column as new feature."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          col:
+            type: str
+            description: "Column for value counts."
+        required:
+          - col
+    fit:
+      description: "Fit the CatCount model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+
+TargetMeanEncoder:
+  type: class
+  description: "Encodes a categorical column by the mean of the label column, and adds the result as a new feature."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          col:
+            type: str
+            description: "Column to be mean encoded."
+          label:
+            type: str
+            description: "Predicted label column."
+        required:
+          - col
+          - label
+    fit:
+      description: "Fit the TargetMeanEncoder model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+
+KFoldTargetMeanEncoder:
+  type: class
+  description: "Adds a new feature to the DataFrame by k-fold mean encoding of a categorical column using the label column."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          col:
+            type: str
+            description: "Column to be k-fold mean encoded."
+          label:
+            type: str
+            description: "Predicted label column."
+          n_splits:
+            type: int
+            description: "Number of splits for K-fold."
+            default: 5
+          random_state:
+            type: int
+            description: "Random seed."
+            default: 2021
+        required:
+          - col
+          - label
+    fit:
+      description: "Fit the KFoldTargetMeanEncoder model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+
+CatCross:
+  type: class
+  description: "Add pairwise crossed features and convert them to numerical features."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          cols:
+            type: list
+            description: "Columns to be pairwise crossed, at least 2 columns."
+          max_cat_num:
+            type: int
+            description: "Maximum unique categories per crossed feature."
+            default: 100
+        required:
+          - cols
+    fit:
+      description: "Fit the CatCross model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+
+GroupStat:
+  type: class
+  description: "Aggregate specified column in a DataFrame grouped by another column, adding new features named '<agg_col>_<agg_func>_by_<group_col>'."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          group_col:
+            type: str
+            description: "Column used for grouping."
+          agg_col:
+            type: str
+            description: "Column on which aggregation is performed."
+          agg_funcs:
+            type: list
+            description: >-
+              List of aggregation functions to apply, such as ['mean', 'std'].
+              Each function must be supported by pandas.
+        required:
+          - group_col
+          - agg_col
+          - agg_funcs
+    fit:
+      description: "Fit the GroupStat model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+
+SplitBins:
+  type: class
+  description: "Inplace binning of continuous data into intervals, returning integer-encoded bin identifiers directly."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          cols:
+            type: list
+            description: "Columns to be binned inplace."
+          strategy:
+            type: str
+            description: "Strategy used to define the widths of the bins."
+            default: quantile
+            enum:
+              - quantile
+              - uniform
+              - kmeans
+        required:
+          - cols
+    fit:
+      description: "Fit the SplitBins model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+
+GeneralSelection:
+  type: class
+  description: "Drop all nan feats and feats with only one unique value."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          label_col:
+            type: str
+            description: "Label column name."
+        required:
+          - label_col
+    fit:
+      description: "Fit the GeneralSelection model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+
+
+TreeBasedSelection:
+  type: class
+  description: "Select features based on tree-based model and remove features with low importance."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          label_col:
+            type: str
+            description: "Label column name."
+          task_type:
+            type: str
+            description: "Task type, 'cls' for classification, 'mcls' for multi-class classification, 'reg' for regression."
+            enum:
+              - cls
+              - mcls
+              - reg
+        required:
+          - label_col
+          - task_type
+    fit:
+      description: "Fit the TreeBasedSelection model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame containing label_col."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame containing label_col."
+
+VarianceBasedSelection:
+  type: class
+  description: "Select features based on variance and remove features with low variance."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          label_col:
+            type: str
+            description: "Label column name."
+          threshold:
+            type: float
+            description: "Threshold for variance."
+            default: 0.0
+        required:
+          - label_col
+    fit:
+      description: "Fit the VarianceBasedSelection model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame containing label_col."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame containing label_col."
\ No newline at end of file
diff --git a/metagpt/tools/functions/schemas/feature_engineering/SplitBins.yml b/metagpt/tools/functions/schemas/feature_engineering/SplitBins.yml
new file mode 100644
index 000000000..4e0171406
--- /dev/null
+++ b/metagpt/tools/functions/schemas/feature_engineering/SplitBins.yml
@@ -0,0 +1,56 @@
+SplitBins:
+  type: class
+  description: "Inplace binning of continuous data into intervals, returning integer-encoded bin identifiers directly."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          cols:
+            type: list
+            description: "Columns to be binned inplace."
+          strategy:
+            type: str
+            description: "Strategy used to define the widths of the bins."
+            default: quantile
+            enum:
+              - quantile
+              - uniform
+              - kmeans
+        required:
+          - cols
+    fit:
+      description: "Fit the SplitBins model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
\ No newline at end of file
diff --git a/metagpt/tools/functions/schemas/feature_engineering/TargetMeanEncoder.yml b/metagpt/tools/functions/schemas/feature_engineering/TargetMeanEncoder.yml
new file mode 100644
index 000000000..86416ccbb
--- /dev/null
+++ b/metagpt/tools/functions/schemas/feature_engineering/TargetMeanEncoder.yml
@@ -0,0 +1,52 @@
+TargetMeanEncoder:
+  type: class
+  description: "Encodes a categorical column by the mean of the label column, and adds the result as a new feature."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          col:
+            type: str
+            description: "Column to be mean encoded."
+          label:
+            type: str
+            description: "Predicted label column."
+        required:
+          - col
+          - label
+    fit:
+      description: "Fit the TargetMeanEncoder model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
\ No newline at end of file
diff --git a/metagpt/tools/functions/schemas/feature_engineering/TreeBasedSelection.yml b/metagpt/tools/functions/schemas/feature_engineering/TreeBasedSelection.yml
new file mode 100644
index 000000000..c210effea
--- /dev/null
+++ b/metagpt/tools/functions/schemas/feature_engineering/TreeBasedSelection.yml
@@ -0,0 +1,56 @@
+TreeBasedSelection:
+  type: class
+  description: "Select features based on tree-based model and remove features with low importance."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          label_col:
+            type: str
+            description: "Label column name."
+          task_type:
+            type: str
+            description: "Task type, 'cls' for classification, 'mcls' for multi-class classification, 'reg' for regression."
+            enum:
+              - cls
+              - mcls
+              - reg
+        required:
+          - label_col
+          - task_type
+    fit:
+      description: "Fit the TreeBasedSelection model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame containing label_col."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame containing label_col."
\ No newline at end of file
diff --git a/metagpt/tools/functions/schemas/feature_engineering/VarianceBasedSelection.yml b/metagpt/tools/functions/schemas/feature_engineering/VarianceBasedSelection.yml
new file mode 100644
index 000000000..6da4c3e7f
--- /dev/null
+++ b/metagpt/tools/functions/schemas/feature_engineering/VarianceBasedSelection.yml
@@ -0,0 +1,52 @@
+VarianceBasedSelection:
+  type: class
+  description: "Select features based on variance and remove features with low variance."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          label_col:
+            type: str
+            description: "Label column name."
+          threshold:
+            type: float
+            description: "Threshold for variance."
+            default: 0.0
+        required:
+          - label_col
+    fit:
+      description: "Fit the VarianceBasedSelection model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame containing label_col."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame containing label_col."
\ No newline at end of file
diff --git a/metagpt/tools/tool_registry.py b/metagpt/tools/tool_registry.py
new file mode 100644
index 000000000..201c63c71
--- /dev/null
+++ b/metagpt/tools/tool_registry.py
@@ -0,0 +1,128 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+@Time    : 2023/01/12 17:07
+@Author  : garylin2099
+@File    : tool_registry.py
+"""
+import os
+from collections import defaultdict
+import inspect
+import re
+
+import yaml
+
+from metagpt.tools.tool_schema import ToolType, ToolSchema, Tool
+from metagpt.logs import logger
+from metagpt.const import TOOL_SCHEMA_PATH
+
+
+class ToolRegistry:
+    """Registry of tool types and tools, looked up by name or by tool type name."""
+
+    def __init__(self):
+        self.tools = {}  # {tool_name: Tool}
+        self.tool_types = {}  # {tool_type_name: ToolType}
+        self.tools_by_types = defaultdict(dict)  # two-layer k-v, {tool_type_name: {tool_name: Tool, ...}, ...}
+
+    def register_tool_type(self, tool_type: ToolType):
+        self.tool_types[tool_type.name] = tool_type
+
+    def register_tool(
+        self,
+        tool_name,
+        tool_path,
+        schema_path=None,
+        tool_code="",
+        tool_type_name="other",
+        make_schema_if_not_exists=False,
+    ):
+        """Load the YAML schema of `tool_name` and register the tool; an already registered name is kept.
+
+        `schema_path` defaults to TOOL_SCHEMA_PATH / tool_type_name / f"{tool_name}.yml". If no file exists
+        there, the tool is skipped unless `make_schema_if_not_exists` is set, which creates an empty one.
+        """
+        if self.has_tool(tool_name):
+            return
+
+        schema_path = schema_path or TOOL_SCHEMA_PATH / tool_type_name / f"{tool_name}.yml"
+        if not os.path.exists(schema_path):
+            if not make_schema_if_not_exists:
+                logger.warning(f"no schema found at assumed schema_path {schema_path}, skip registering {tool_name}")
+                return
+            logger.warning(f"no schema found, will make schema at {schema_path}")
+            make_schema(tool_code, schema_path)
+
+        with open(schema_path, "r", encoding="utf-8") as f:
+            schema = (yaml.safe_load(f) or {}).get(tool_name) or {}  # no entry (e.g. newly made file) -> empty schema
+        schema["tool_path"] = tool_path  # corresponding code file path of the tool
+        try:
+            ToolSchema(**schema)  # validation
+        except Exception as e:  # best-effort: a non-conforming schema is still used, so only leave a trace
+            logger.debug(f"{tool_name} schema does not conform to required format, used anyway. Mismatch: {e}")
+        tool = Tool(name=tool_name, path=tool_path, schema=schema, code=tool_code)
+        self.tools[tool_name] = tool
+        self.tools_by_types[tool_type_name][tool_name] = tool
+        logger.info(f"{tool_name} registered")
+
+    def has_tool(self, key):
+        return key in self.tools
+
+    def get_tool(self, key):
+        return self.tools.get(key)  # None when the tool is not registered
+
+    def get_tools_by_type(self, key):
+        return self.tools_by_types.get(key)  # None when no tool of this type is registered
+
+    def has_tool_type(self, key):
+        return key in self.tool_types
+
+    def get_tool_type(self, key):
+        return self.tool_types.get(key)  # None when the tool type is not registered
+
+    def get_tool_types(self):
+        return self.tool_types
+
+
+# Registry instance
+TOOL_REGISTRY = ToolRegistry()
+
+
+def register_tool_type(cls):
+    """Class decorator: register an instance of `cls`, created with no arguments, in TOOL_REGISTRY."""
+    TOOL_REGISTRY.register_tool_type(tool_type=cls())
+    return cls
+
+
+def register_tool(tool_name="", tool_type_name="other", schema_path=None):
+    """Build a decorator that registers the decorated function / class in TOOL_REGISTRY.
+
+    An empty `tool_name` means the decorated object's `__name__` is used.
+    """
+
+    def decorator(cls, tool_name=tool_name):
+        name = tool_name or cls.__name__
+        # Path of the defining file, cut to start at the first "metagpt" it contains, if any
+        defined_in = inspect.getfile(cls)
+        if "metagpt" in defined_in:
+            defined_in = re.search("metagpt.+", defined_in).group(0)
+        TOOL_REGISTRY.register_tool(
+            tool_name=name,
+            tool_path=defined_in,
+            schema_path=schema_path,
+            tool_code=inspect.getsource(cls),
+            tool_type_name=tool_type_name,
+        )
+        return cls
+
+    return decorator
+
+
+def make_schema(tool_code, path):
+    """Write an empty placeholder schema to `path` and return `path`; `tool_code` is not used yet."""
+    parent_dir = os.path.dirname(path)
+    if parent_dir:  # a bare file name has no directory to create; os.makedirs("") would raise
+        os.makedirs(parent_dir, exist_ok=True)
+    with open(path, "w", encoding="utf-8") as f:
+        yaml.dump({}, f)  # an empty schema for now
+    return path
diff --git a/metagpt/tools/tool_schema.py b/metagpt/tools/tool_schema.py
new file mode 100644
index 000000000..2b90996e5
--- /dev/null
+++ b/metagpt/tools/tool_schema.py
@@ -0,0 +1,31 @@
+from enum import Enum
+
+from pydantic import BaseModel
+
+
+class ToolTypeEnum(Enum):  # tool categories; looking up an unknown value resolves to OTHER
+    DATA_PREPROCESS = "data_preprocess"
+    FEATURE_ENGINEERING = "feature_engineering"
+    MODEL_TRAIN = "model_train"
+    MODEL_EVALUATE = "model_evaluate"
+    OTHER = "other"
+    @classmethod  # Enum's failed-lookup hook is the classmethod `_missing_`; `__missing__` is never called
+    def _missing_(cls, value):  # invoked by ToolTypeEnum(value) when no member has that value
+        return cls.OTHER
+
+
+class ToolType(BaseModel):  # pydantic model describing one tool category
+    name: str  # category identifier
+    desc: str  # short description of the category
+    usage_prompt: str = ""  # presumably guidance text for using tools of this type; empty by default
+
+
+class ToolSchema(BaseModel):  # pydantic model that currently declares only a name field
+    name: str
+
+
+class Tool(BaseModel):  # pydantic record for a single tool
+    name: str  # tool name
+    path: str  # presumably the defining file's path as collected by register_tool — confirm
+    schema: dict = {}  # NOTE(review): "schema" is also a BaseModel attribute name; pydantic flags such shadowing — confirm
+    code: str = ""  # presumably the tool's source code; empty by default — confirm
diff --git a/metagpt/tools/tool_types.py b/metagpt/tools/tool_types.py
new file mode 100644
index 000000000..9104f90b8
--- /dev/null
+++ b/metagpt/tools/tool_types.py
@@ -0,0 +1,43 @@
+from metagpt.prompts.tool_type import (
+    DATA_PREPROCESS_PROMPT,
+    FEATURE_ENGINEERING_PROMPT,
+    MODEL_TRAIN_PROMPT,
+    MODEL_EVALUATE_PROMPT,
+)
+from metagpt.tools.tool_schema import ToolTypeEnum, ToolType
+from metagpt.tools.tool_registry import register_tool_type
+
+
+@register_tool_type  # instantiates the class with no arguments and registers it, so every field needs a default
+class DataPreprocess(ToolType):  # tool type for the data_preprocess category
+    name: str = ToolTypeEnum.DATA_PREPROCESS.value
+    desc: str = "Only for changing value inplace."
+    usage_prompt: str = DATA_PREPROCESS_PROMPT
+
+
+@register_tool_type  # instantiates the class with no arguments and registers it, so every field needs a default
+class FeatureEngineer(ToolType):  # tool type for the feature_engineering category
+    name: str = ToolTypeEnum.FEATURE_ENGINEERING.value
+    desc: str = "Only for creating new columns for input data."
+    usage_prompt: str = FEATURE_ENGINEERING_PROMPT
+
+
+@register_tool_type  # instantiates the class with no arguments and registers it, so every field needs a default
+class ModelTrain(ToolType):  # tool type for the model_train category
+    name: str = ToolTypeEnum.MODEL_TRAIN.value
+    desc: str = "Only for training model."
+    usage_prompt: str = MODEL_TRAIN_PROMPT
+
+
+@register_tool_type  # instantiates the class with no arguments and registers it, so every field needs a default
+class ModelEvaluate(ToolType):  # tool type for the model_evaluate category
+    name: str = ToolTypeEnum.MODEL_EVALUATE.value
+    desc: str = "Only for evaluating model."
+    usage_prompt: str = MODEL_EVALUATE_PROMPT
+
+
+@register_tool_type  # instantiates the class with no arguments and registers it, so every field needs a default
+class Other(ToolType):  # catch-all tool type for the "other" category
+    name: str = ToolTypeEnum.OTHER.value
+    desc: str = "Any tools not in the defined categories"
+    usage_prompt: str = ""  # no usage guidance for the catch-all category