From 46cd219e817eae2abf6d5a8b552bebf531672526 Mon Sep 17 00:00:00 2001
From: yzlin <yzlin@fuzhi.ai>
Date: Sat, 13 Jan 2024 01:28:49 +0800
Subject: [PATCH 01/12] add tool registry

---
 metagpt/actions/write_analysis_code.py        |  58 +-
 metagpt/actions/write_plan.py                 |   7 +-
 metagpt/prompts/ml_engineer.py                |   2 +-
 metagpt/tools/__init__.py                     |  11 -
 .../tools/functions/libs/data_preprocess.py   |  13 +
 .../functions/libs/feature_engineering.py     |  17 +-
 .../data_preprocess/FillMissingValue.yml      |  61 ++
 .../schemas/data_preprocess/LabelEncode.yml   |  48 ++
 .../schemas/data_preprocess/MaxAbsScale.yml   |  48 ++
 .../schemas/data_preprocess/MinMaxScale.yml   |  48 ++
 .../schemas/data_preprocess/OneHotEncode.yml  |  48 ++
 .../schemas/data_preprocess/StandardScale.yml |  48 ++
 .../schemas/feature_engineering/CatCount.yml  |  48 ++
 .../schemas/feature_engineering/CatCross.yml  |  52 ++
 .../feature_engineering/GeneralSelection.yml  |  48 ++
 .../schemas/feature_engineering/GroupStat.yml |  58 ++
 .../KFoldTargetMeanEncoder.yml                |  60 ++
 .../PolynomialExpansion.yml                   | 548 ++++++++++++++++++
 .../schemas/feature_engineering/SplitBins.yml |  56 ++
 .../feature_engineering/TargetMeanEncoder.yml |  52 ++
 .../TreeBasedSelection.yml                    |  56 ++
 .../VarianceBasedSelection.yml                |  52 ++
 metagpt/tools/tool_registry.py                | 128 ++++
 metagpt/tools/tool_schema.py                  |  31 +
 metagpt/tools/tool_types.py                   |  43 ++
 25 files changed, 1582 insertions(+), 59 deletions(-)
 create mode 100644 metagpt/tools/functions/schemas/data_preprocess/FillMissingValue.yml
 create mode 100644 metagpt/tools/functions/schemas/data_preprocess/LabelEncode.yml
 create mode 100644 metagpt/tools/functions/schemas/data_preprocess/MaxAbsScale.yml
 create mode 100644 metagpt/tools/functions/schemas/data_preprocess/MinMaxScale.yml
 create mode 100644 metagpt/tools/functions/schemas/data_preprocess/OneHotEncode.yml
 create mode 100644 metagpt/tools/functions/schemas/data_preprocess/StandardScale.yml
 create mode 100644 metagpt/tools/functions/schemas/feature_engineering/CatCount.yml
 create mode 100644 metagpt/tools/functions/schemas/feature_engineering/CatCross.yml
 create mode 100644 metagpt/tools/functions/schemas/feature_engineering/GeneralSelection.yml
 create mode 100644 metagpt/tools/functions/schemas/feature_engineering/GroupStat.yml
 create mode 100644 metagpt/tools/functions/schemas/feature_engineering/KFoldTargetMeanEncoder.yml
 create mode 100644 metagpt/tools/functions/schemas/feature_engineering/PolynomialExpansion.yml
 create mode 100644 metagpt/tools/functions/schemas/feature_engineering/SplitBins.yml
 create mode 100644 metagpt/tools/functions/schemas/feature_engineering/TargetMeanEncoder.yml
 create mode 100644 metagpt/tools/functions/schemas/feature_engineering/TreeBasedSelection.yml
 create mode 100644 metagpt/tools/functions/schemas/feature_engineering/VarianceBasedSelection.yml
 create mode 100644 metagpt/tools/tool_registry.py
 create mode 100644 metagpt/tools/tool_schema.py
 create mode 100644 metagpt/tools/tool_types.py

diff --git a/metagpt/actions/write_analysis_code.py b/metagpt/actions/write_analysis_code.py
index 9104fdf82..f4ae1e572 100644
--- a/metagpt/actions/write_analysis_code.py
+++ b/metagpt/actions/write_analysis_code.py
@@ -8,11 +8,9 @@ import re
 from pathlib import Path
 from typing import Dict, List, Tuple, Union
 
-import yaml
 from tenacity import retry, stop_after_attempt, wait_fixed
 
 from metagpt.actions import Action
-from metagpt.const import TOOL_SCHEMA_PATH
 from metagpt.llm import LLM
 from metagpt.logs import logger
 from metagpt.prompts.ml_engineer import (
@@ -24,12 +22,9 @@ from metagpt.prompts.ml_engineer import (
     TOOL_USAGE_PROMPT,
 )
 from metagpt.schema import Message, Plan
-from metagpt.tools import TOOL_TYPE_MAPPINGS
+from metagpt.tools.tool_registry import TOOL_REGISTRY
 from metagpt.utils.common import create_func_config, remove_comments
 
-TOOL_TYPE_MODULE = {k: v.module for k, v in TOOL_TYPE_MAPPINGS.items()}
-TOOL_TYPE_USAGE_PROMPT = {k: v.usage_prompt for k, v in TOOL_TYPE_MAPPINGS.items()}
-
 
 class BaseWriteAnalysisCode(Action):
     DEFAULT_SYSTEM_MSG: str = """You are Code Interpreter, a world-class programmer that can complete any goal by executing code. Strictly follow the plan and generate code step by step. Each step of the code will be executed on the user's machine, and the user will provide the code execution results to you.**Notice: The code for the next step depends on the code for the previous step. Must reuse variables in the lastest other code directly, dont creat it again, it is very import for you. Use !pip install in a standalone block to install missing packages.Usually the libraries you need are already installed.Dont check if packages already imported.**"""  # prompt reference: https://github.com/KillianLucas/open-interpreter/blob/v0.1.4/interpreter/system_message.txt
@@ -95,49 +90,27 @@ class WriteCodeByGenerate(BaseWriteAnalysisCode):
 class WriteCodeWithTools(BaseWriteAnalysisCode):
     """Write code with help of local available tools. Choose tools first, then generate code to use the tools"""
 
-    schema_path: Union[Path, str] = TOOL_SCHEMA_PATH
     available_tools: dict = {}
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
-        self._load_tools(self.schema_path)
 
-    def _load_tools(self, schema_path, schema_module=None):
-        """Load tools from yaml file"""
-        if isinstance(schema_path, dict):
-            schema_module = schema_module or "udf"
-            self.available_tools.update({schema_module: schema_path})
-        else:
-            if isinstance(schema_path, list):
-                yml_files = schema_path
-            elif isinstance(schema_path, Path) and schema_path.is_file():
-                yml_files = [schema_path]
-            else:
-                yml_files = schema_path.glob("*.yml")
-
-            for yml_file in yml_files:
-                module = yml_file.stem
-                with open(yml_file, "r", encoding="utf-8") as f:
-                    self.available_tools[module] = yaml.safe_load(f)
-
-    def _parse_recommend_tools(self, module: str, recommend_tools: list) -> dict:
+    def _parse_recommend_tools(self, recommend_tools: list) -> dict:
         """
         Parses and validates a list of recommended tools, and retrieves their schema from registry.
 
         Args:
-            module (str): The module name for querying tools in the registry.
             recommend_tools (list): A list of recommended tools.
 
         Returns:
             dict: A dict of valid tool schemas.
         """
         valid_tools = []
-        available_tools = self.available_tools[module].keys()
-        for tool in recommend_tools:
-            if tool in available_tools:
-                valid_tools.append(tool)
+        for tool_name in recommend_tools:
+            if TOOL_REGISTRY.has_tool(tool_name):
+                valid_tools.append(TOOL_REGISTRY.get_tool(tool_name))
 
-        tool_catalog = {tool: self.available_tools[module][tool] for tool in valid_tools}
+        tool_catalog = {tool.name: tool.schema for tool in valid_tools}
         return tool_catalog
 
     async def _tool_recommendation(
@@ -176,8 +149,10 @@ class WriteCodeWithTools(BaseWriteAnalysisCode):
         tool_type = (
             plan.current_task.task_type
         )  # find tool type from task type through exact match, can extend to retrieval in the future
-        available_tools = self.available_tools.get(tool_type, {})
-        special_prompt = TOOL_TYPE_USAGE_PROMPT.get(tool_type, "")
+        available_tools = TOOL_REGISTRY.get_tools_by_type(tool_type)
+        special_prompt = (
+            TOOL_REGISTRY.get_tool_type(tool_type).usage_prompt if TOOL_REGISTRY.has_tool_type(tool_type) else ""
+        )
         code_steps = plan.current_task.code_steps
 
         finished_tasks = plan.get_finished_tasks()
@@ -185,22 +160,17 @@ class WriteCodeWithTools(BaseWriteAnalysisCode):
         code_context = "\n\n".join(code_context)
 
         tool_catalog = {}
-        module_name = ""
 
-        if len(available_tools) > 0:
-            available_tools = {k: v["description"] for k, v in available_tools.items()}
+        if available_tools:
+            available_tools = {tool_name: tool.schema["description"] for tool_name, tool in available_tools.items()}
 
             recommend_tools = await self._tool_recommendation(
                 plan.current_task.instruction, code_steps, available_tools
             )
-            tool_catalog = self._parse_recommend_tools(tool_type, recommend_tools)
+            tool_catalog = self._parse_recommend_tools(recommend_tools)
             logger.info(f"Recommended tools: \n{recommend_tools}")
 
-            module_name = TOOL_TYPE_MODULE[tool_type]
-
-        tools_instruction = TOOL_USAGE_PROMPT.format(
-            special_prompt=special_prompt, module_name=module_name, tool_catalog=tool_catalog
-        )
+        tools_instruction = TOOL_USAGE_PROMPT.format(special_prompt=special_prompt, tool_catalog=tool_catalog)
 
         context.append(Message(content=tools_instruction, role="user"))
 
diff --git a/metagpt/actions/write_plan.py b/metagpt/actions/write_plan.py
index c7ef541b9..60dcef43b 100644
--- a/metagpt/actions/write_plan.py
+++ b/metagpt/actions/write_plan.py
@@ -12,7 +12,7 @@ from metagpt.actions import Action
 from metagpt.logs import logger
 from metagpt.prompts.ml_engineer import ASSIGN_TASK_TYPE_CONFIG, ASSIGN_TASK_TYPE_PROMPT
 from metagpt.schema import Message, Plan, Task
-from metagpt.tools import TOOL_TYPE_MAPPINGS
+from metagpt.tools import TOOL_REGISTRY
 from metagpt.utils.common import CodeParser, create_func_config
 
 
@@ -47,13 +47,16 @@ class WritePlan(Action):
             List[Dict]: tasks with task type assigned
         """
         task_list = "\n".join([f"Task {task['task_id']}: {task['instruction']}" for task in tasks])
-        task_type_desc = "\n".join([f"- **{item.name}**: {item.desc}" for item in TOOL_TYPE_MAPPINGS.values()])
+        task_type_desc = "\n".join(
+            [f"- **{tool_type.name}**: {tool_type.desc}" for tool_type in TOOL_REGISTRY.get_tool_types().values()]
+        )  # task types are bound to tool types for now; this should be improved in the future
         prompt = ASSIGN_TASK_TYPE_PROMPT.format(
             task_list=task_list, task_type_desc=task_type_desc
         )  # task types are set to be the same as tool types, for now
         tool_config = create_func_config(ASSIGN_TASK_TYPE_CONFIG)
         rsp = await self.llm.aask_code(prompt, **tool_config)
         task_type_list = rsp["task_type"]
+        print(f"assigned task types: {task_type_list}")
         for task, task_type in zip(tasks, task_type_list):
             task["task_type"] = task_type
         return json.dumps(tasks)
diff --git a/metagpt/prompts/ml_engineer.py b/metagpt/prompts/ml_engineer.py
index 3baf79843..31d754a9e 100644
--- a/metagpt/prompts/ml_engineer.py
+++ b/metagpt/prompts/ml_engineer.py
@@ -203,7 +203,7 @@ Specifically, {special_prompt}
 - You can freely combine the use of any other public packages, like sklearn, numpy, pandas, etc..
 
 # Available Tools (can be empty):
-Each Class tool is described in JSON format. When you call a tool, import the tool from `{module_name}` first.
+Each Class tool is described in JSON format. When you call a tool, import the tool first.
 {tool_catalog}
 
 # Constraints:
diff --git a/metagpt/tools/__init__.py b/metagpt/tools/__init__.py
index 222edf312..f743d63c7 100644
--- a/metagpt/tools/__init__.py
+++ b/metagpt/tools/__init__.py
@@ -8,17 +8,6 @@
 
 from enum import Enum
 
-from pydantic import BaseModel
-
-from metagpt.const import TOOL_LIBS_PATH
-from metagpt.prompts.tool_type import (
-    DATA_PREPROCESS_PROMPT,
-    FEATURE_ENGINEERING_PROMPT,
-    MODEL_TRAIN_PROMPT,
-    MODEL_EVALUATE_PROMPT,
-    VISION_PROMPT,
-)
-
 
 class SearchEngineType(Enum):
     SERPAPI_GOOGLE = "serpapi"
diff --git a/metagpt/tools/functions/libs/data_preprocess.py b/metagpt/tools/functions/libs/data_preprocess.py
index f423f2020..59ede3ffc 100644
--- a/metagpt/tools/functions/libs/data_preprocess.py
+++ b/metagpt/tools/functions/libs/data_preprocess.py
@@ -14,8 +14,13 @@ from sklearn.preprocessing import (
 )
 
 from metagpt.tools.functions.libs.base import MLProcess
+from metagpt.tools.tool_registry import register_tool
+from metagpt.tools.tool_schema import ToolTypeEnum
+
+TOOL_TYPE = ToolTypeEnum.DATA_PREPROCESS.value
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 class FillMissingValue(MLProcess):
     def __init__(
         self,
@@ -42,6 +47,7 @@ class FillMissingValue(MLProcess):
         return new_df
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 class MinMaxScale(MLProcess):
     def __init__(
         self,
@@ -60,6 +66,7 @@ class MinMaxScale(MLProcess):
         return new_df
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 class StandardScale(MLProcess):
     def __init__(
         self,
@@ -78,6 +85,7 @@ class StandardScale(MLProcess):
         return new_df
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 class MaxAbsScale(MLProcess):
     def __init__(
         self,
@@ -96,6 +104,7 @@ class MaxAbsScale(MLProcess):
         return new_df
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 class RobustScale(MLProcess):
     def __init__(
         self,
@@ -114,6 +123,7 @@ class RobustScale(MLProcess):
         return new_df
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 class OrdinalEncode(MLProcess):
     def __init__(
         self,
@@ -132,6 +142,7 @@ class OrdinalEncode(MLProcess):
         return new_df
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 class OneHotEncode(MLProcess):
     def __init__(
         self,
@@ -153,6 +164,7 @@ class OneHotEncode(MLProcess):
         return new_df
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 class LabelEncode(MLProcess):
     def __init__(
         self,
@@ -181,6 +193,7 @@ class LabelEncode(MLProcess):
         return new_df
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 def get_column_info(df: pd.DataFrame) -> dict:
     column_info = {
         "Category": [],
diff --git a/metagpt/tools/functions/libs/feature_engineering.py b/metagpt/tools/functions/libs/feature_engineering.py
index 0d9584b4a..8b96cbd07 100644
--- a/metagpt/tools/functions/libs/feature_engineering.py
+++ b/metagpt/tools/functions/libs/feature_engineering.py
@@ -6,7 +6,7 @@
 # @Desc    : Feature Engineering Tools
 import itertools
 
-import lightgbm as lgb
+# import lightgbm as lgb
 import numpy as np
 import pandas as pd
 from joblib import Parallel, delayed
@@ -16,8 +16,13 @@ from sklearn.model_selection import KFold
 from sklearn.preprocessing import KBinsDiscretizer, PolynomialFeatures
 
 from metagpt.tools.functions.libs.base import MLProcess
+from metagpt.tools.tool_registry import register_tool
+from metagpt.tools.tool_schema import ToolTypeEnum
+
+TOOL_TYPE = ToolTypeEnum.FEATURE_ENGINEERING.value
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 class PolynomialExpansion(MLProcess):
     def __init__(self, cols: list, degree: int = 2, label_col: str = None):
         self.cols = cols
@@ -48,6 +53,7 @@ class PolynomialExpansion(MLProcess):
         return new_df
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 class CatCount(MLProcess):
     def __init__(self, col: str):
         self.col = col
@@ -62,6 +68,7 @@ class CatCount(MLProcess):
         return new_df
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 class TargetMeanEncoder(MLProcess):
     def __init__(self, col: str, label: str):
         self.col = col
@@ -77,6 +84,7 @@ class TargetMeanEncoder(MLProcess):
         return new_df
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 class KFoldTargetMeanEncoder(MLProcess):
     def __init__(self, col: str, label: str, n_splits: int = 5, random_state: int = 2021):
         self.col = col
@@ -103,6 +111,7 @@ class KFoldTargetMeanEncoder(MLProcess):
         return new_df
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 class CatCross(MLProcess):
     def __init__(self, cols: list, max_cat_num: int = 100):
         self.cols = cols
@@ -138,6 +147,7 @@ class CatCross(MLProcess):
         return new_df
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 class GroupStat(MLProcess):
     def __init__(self, group_col: str, agg_col: str, agg_funcs: list):
         self.group_col = group_col
@@ -157,6 +167,7 @@ class GroupStat(MLProcess):
         return new_df
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 class SplitBins(MLProcess):
     def __init__(self, cols: list, strategy: str = "quantile"):
         self.cols = cols
@@ -173,6 +184,7 @@ class SplitBins(MLProcess):
         return new_df
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 class ExtractTimeComps(MLProcess):
     def __init__(self, time_col: str, time_comps: list):
         self.time_col = time_col
@@ -201,6 +213,7 @@ class ExtractTimeComps(MLProcess):
         return new_df
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 class GeneralSelection(MLProcess):
     def __init__(self, label_col: str):
         self.label_col = label_col
@@ -228,6 +241,7 @@ class GeneralSelection(MLProcess):
         return new_df
 
 
+# Not registered as a tool for now: it needs lightgbm (lgb), whose import is commented out in this module
 class TreeBasedSelection(MLProcess):
     def __init__(self, label_col: str, task_type: str):
         self.label_col = label_col
@@ -270,6 +284,7 @@ class TreeBasedSelection(MLProcess):
         return new_df
 
 
+@register_tool(tool_type_name=TOOL_TYPE)
 class VarianceBasedSelection(MLProcess):
     def __init__(self, label_col: str, threshold: float = 0):
         self.label_col = label_col
diff --git a/metagpt/tools/functions/schemas/data_preprocess/FillMissingValue.yml b/metagpt/tools/functions/schemas/data_preprocess/FillMissingValue.yml
new file mode 100644
index 000000000..44c830a1e
--- /dev/null
+++ b/metagpt/tools/functions/schemas/data_preprocess/FillMissingValue.yml
@@ -0,0 +1,61 @@
+FillMissingValue:
+  type: class
+  description: "Completing missing values with simple strategies"
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          features:
+            type: list
+            description: "columns to be processed"
+          strategy:
+            type: str
+            description: "The imputation strategy; note that mean/median can only be used for numeric features"
+            default: mean
+            enum:
+              - mean
+              - median
+              - most_frequent
+              - constant
+          fill_value:
+            type: int
+            description: "fill_value is used to replace all occurrences of missing_values"
+            default: null
+        required:
+          - features
+    fit:
+      description: "Fit the FillMissingValue model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
diff --git a/metagpt/tools/functions/schemas/data_preprocess/LabelEncode.yml b/metagpt/tools/functions/schemas/data_preprocess/LabelEncode.yml
new file mode 100644
index 000000000..419ef60a8
--- /dev/null
+++ b/metagpt/tools/functions/schemas/data_preprocess/LabelEncode.yml
@@ -0,0 +1,48 @@
+LabelEncode:
+  type: class
+  description: "Apply label encoding to specified categorical columns in-place."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          features:
+            type: list
+            description: "Categorical columns to be label encoded"
+        required:
+          - features
+    fit:
+      description: "Fit the LabelEncode model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
diff --git a/metagpt/tools/functions/schemas/data_preprocess/MaxAbsScale.yml b/metagpt/tools/functions/schemas/data_preprocess/MaxAbsScale.yml
new file mode 100644
index 000000000..3e17cfdd0
--- /dev/null
+++ b/metagpt/tools/functions/schemas/data_preprocess/MaxAbsScale.yml
@@ -0,0 +1,48 @@
+MaxAbsScale:
+  type: class
+  description: "Scale each feature by its maximum absolute value"
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          features:
+            type: list
+            description: "columns to be processed"
+        required:
+          - features
+    fit:
+      description: "Fit the MaxAbsScale model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
\ No newline at end of file
diff --git a/metagpt/tools/functions/schemas/data_preprocess/MinMaxScale.yml b/metagpt/tools/functions/schemas/data_preprocess/MinMaxScale.yml
new file mode 100644
index 000000000..8f050d942
--- /dev/null
+++ b/metagpt/tools/functions/schemas/data_preprocess/MinMaxScale.yml
@@ -0,0 +1,48 @@
+MinMaxScale:
+  type: class
+  description: "Transform features by scaling each feature to a range, which is (0, 1)"
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          features:
+            type: list
+            description: "columns to be processed"
+        required:
+          - features
+    fit:
+      description: "Fit the MinMaxScale model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
diff --git a/metagpt/tools/functions/schemas/data_preprocess/OneHotEncode.yml b/metagpt/tools/functions/schemas/data_preprocess/OneHotEncode.yml
new file mode 100644
index 000000000..f499b2cb8
--- /dev/null
+++ b/metagpt/tools/functions/schemas/data_preprocess/OneHotEncode.yml
@@ -0,0 +1,48 @@
+OneHotEncode:
+  type: class
+  description: "Apply one-hot encoding to specified categorical columns, the original columns will be dropped."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          features:
+            type: list
+            description: "Categorical columns to be one-hot encoded and dropped"
+        required:
+          - features
+    fit:
+      description: "Fit the OneHotEncode model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
diff --git a/metagpt/tools/functions/schemas/data_preprocess/StandardScale.yml b/metagpt/tools/functions/schemas/data_preprocess/StandardScale.yml
new file mode 100644
index 000000000..cf6e7d57b
--- /dev/null
+++ b/metagpt/tools/functions/schemas/data_preprocess/StandardScale.yml
@@ -0,0 +1,48 @@
+StandardScale:
+  type: class
+  description: "Standardize features by removing the mean and scaling to unit variance"
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          features:
+            type: list
+            description: "columns to be processed"
+        required:
+          - features
+    fit:
+      description: "Fit the StandardScale model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
diff --git a/metagpt/tools/functions/schemas/feature_engineering/CatCount.yml b/metagpt/tools/functions/schemas/feature_engineering/CatCount.yml
new file mode 100644
index 000000000..049fc7879
--- /dev/null
+++ b/metagpt/tools/functions/schemas/feature_engineering/CatCount.yml
@@ -0,0 +1,48 @@
+CatCount:
+  type: class
+  description: "Add value counts of a categorical column as new feature."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          col:
+            type: str
+            description: "Column for value counts."
+        required:
+          - col
+    fit:
+      description: "Fit the CatCount model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
\ No newline at end of file
diff --git a/metagpt/tools/functions/schemas/feature_engineering/CatCross.yml b/metagpt/tools/functions/schemas/feature_engineering/CatCross.yml
new file mode 100644
index 000000000..5d6303439
--- /dev/null
+++ b/metagpt/tools/functions/schemas/feature_engineering/CatCross.yml
@@ -0,0 +1,52 @@
+CatCross:
+  type: class
+  description: "Add pairwise crossed features and convert them to numerical features."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          cols:
+            type: list
+            description: "Columns to be pairwise crossed, at least 2 columns."
+          max_cat_num:
+            type: int
+            description: "Maximum unique categories per crossed feature."
+            default: 100
+        required:
+          - cols
+    fit:
+      description: "Fit the CatCross model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
\ No newline at end of file
diff --git a/metagpt/tools/functions/schemas/feature_engineering/GeneralSelection.yml b/metagpt/tools/functions/schemas/feature_engineering/GeneralSelection.yml
new file mode 100644
index 000000000..2ebf5b397
--- /dev/null
+++ b/metagpt/tools/functions/schemas/feature_engineering/GeneralSelection.yml
@@ -0,0 +1,48 @@
+GeneralSelection:
+  type: class
+  description: "Drop features that are all NaN and features with only one unique value."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          label_col:
+            type: str
+            description: "Label column name."
+        required:
+          - label_col
+    fit:
+      description: "Fit the GeneralSelection model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
\ No newline at end of file
diff --git a/metagpt/tools/functions/schemas/feature_engineering/GroupStat.yml b/metagpt/tools/functions/schemas/feature_engineering/GroupStat.yml
new file mode 100644
index 000000000..6e0ba2877
--- /dev/null
+++ b/metagpt/tools/functions/schemas/feature_engineering/GroupStat.yml
@@ -0,0 +1,58 @@
+GroupStat:
+  type: class
+  description: "Aggregate specified column in a DataFrame grouped by another column, adding new features named '<agg_col>_<agg_func>_by_<group_col>'."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          group_col:
+            type: str
+            description: "Column used for grouping."
+          agg_col:
+            type: str
+            description: "Column on which aggregation is performed."
+          agg_funcs:
+            type: list
+            description: >-
+              List of aggregation functions to apply, such as ['mean', 'std'].
+              Each function must be supported by pandas.
+        required:
+          - group_col
+          - agg_col
+          - agg_funcs
+    fit:
+      description: "Fit the GroupStat model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
\ No newline at end of file
diff --git a/metagpt/tools/functions/schemas/feature_engineering/KFoldTargetMeanEncoder.yml b/metagpt/tools/functions/schemas/feature_engineering/KFoldTargetMeanEncoder.yml
new file mode 100644
index 000000000..79a673f9f
--- /dev/null
+++ b/metagpt/tools/functions/schemas/feature_engineering/KFoldTargetMeanEncoder.yml
@@ -0,0 +1,60 @@
+KFoldTargetMeanEncoder:
+  type: class
+  description: "Adds a new feature to the DataFrame by k-fold mean encoding of a categorical column using the label column."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          col:
+            type: str
+            description: "Column to be k-fold mean encoded."
+          label:
+            type: str
+            description: "Predicted label column."
+          n_splits:
+            type: int
+            description: "Number of splits for K-fold."
+            default: 5
+          random_state:
+            type: int
+            description: "Random seed."
+            default: 2021
+        required:
+          - col
+          - label
+    fit:
+      description: "Fit the KFoldTargetMeanEncoder model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
\ No newline at end of file
diff --git a/metagpt/tools/functions/schemas/feature_engineering/PolynomialExpansion.yml b/metagpt/tools/functions/schemas/feature_engineering/PolynomialExpansion.yml
new file mode 100644
index 000000000..62e6ad5b3
--- /dev/null
+++ b/metagpt/tools/functions/schemas/feature_engineering/PolynomialExpansion.yml
@@ -0,0 +1,56 @@
+PolynomialExpansion:
+  type: class
+  description: "Add polynomial and interaction features from selected numeric columns to input DataFrame."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          cols:
+            type: list
+            description: "Columns for polynomial expansion."
+          label_col:
+            type: str
+            description: "Label column name."
+          degree:
+            type: int
+            description: "The degree of the polynomial features."
+            default: 2
+        required:
+          - cols
+          - label_col
+    fit:
+      description: "Fit the PolynomialExpansion model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame without duplicated columns."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame without duplicated columns."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
\ No newline at end of file
diff --git a/metagpt/tools/functions/schemas/feature_engineering/SplitBins.yml b/metagpt/tools/functions/schemas/feature_engineering/SplitBins.yml
new file mode 100644
index 000000000..4e0171406
--- /dev/null
+++ b/metagpt/tools/functions/schemas/feature_engineering/SplitBins.yml
@@ -0,0 +1,56 @@
+SplitBins:
+  type: class
+  description: "Inplace binning of continuous data into intervals, returning integer-encoded bin identifiers directly."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          cols:
+            type: list
+            description: "Columns to be binned inplace."
+          strategy:
+            type: str
+            description: "Strategy used to define the widths of the bins."
+            default: quantile
+            enum:
+              - quantile
+              - uniform
+              - kmeans
+        required:
+          - cols
+    fit:
+      description: "Fit the SplitBins model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
\ No newline at end of file
diff --git a/metagpt/tools/functions/schemas/feature_engineering/TargetMeanEncoder.yml b/metagpt/tools/functions/schemas/feature_engineering/TargetMeanEncoder.yml
new file mode 100644
index 000000000..86416ccbb
--- /dev/null
+++ b/metagpt/tools/functions/schemas/feature_engineering/TargetMeanEncoder.yml
@@ -0,0 +1,52 @@
+TargetMeanEncoder:
+  type: class
+  description: "Encodes a categorical column by the mean of the label column, and adds the result as a new feature."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          col:
+            type: str
+            description: "Column to be mean encoded."
+          label:
+            type: str
+            description: "Predicted label column."
+        required:
+          - col
+          - label
+    fit:
+      description: "Fit the TargetMeanEncoder model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame."
\ No newline at end of file
diff --git a/metagpt/tools/functions/schemas/feature_engineering/TreeBasedSelection.yml b/metagpt/tools/functions/schemas/feature_engineering/TreeBasedSelection.yml
new file mode 100644
index 000000000..c210effea
--- /dev/null
+++ b/metagpt/tools/functions/schemas/feature_engineering/TreeBasedSelection.yml
@@ -0,0 +1,56 @@
+TreeBasedSelection:
+  type: class
+  description: "Select features based on tree-based model and remove features with low importance."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          label_col:
+            type: str
+            description: "Label column name."
+          task_type:
+            type: str
+            description: "Task type, 'cls' for classification, 'mcls' for multi-class classification, 'reg' for regression."
+            enum:
+              - cls
+              - mcls
+              - reg
+        required:
+          - label_col
+          - task_type
+    fit:
+      description: "Fit the TreeBasedSelection model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame containing label_col."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame containing label_col."
\ No newline at end of file
diff --git a/metagpt/tools/functions/schemas/feature_engineering/VarianceBasedSelection.yml b/metagpt/tools/functions/schemas/feature_engineering/VarianceBasedSelection.yml
new file mode 100644
index 000000000..6da4c3e7f
--- /dev/null
+++ b/metagpt/tools/functions/schemas/feature_engineering/VarianceBasedSelection.yml
@@ -0,0 +1,52 @@
+VarianceBasedSelection:
+  type: class
+  description: "Select features based on variance and remove features with low variance."
+  methods:
+    __init__:
+      description: "Initialize self."
+      parameters:
+        properties:
+          label_col:
+            type: str
+            description: "Label column name."
+          threshold:
+            type: float
+            description: "Threshold for variance."
+            default: 0.0
+        required:
+          - label_col
+    fit:
+      description: "Fit the VarianceBasedSelection model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+    transform:
+      description: "Transform the input DataFrame with the fitted model."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame containing label_col."
+    fit_transform:
+      description: "Fit and transform the input DataFrame."
+      parameters:
+        properties:
+          df:
+            type: DataFrame
+            description: "The input DataFrame."
+        required:
+          - df
+      returns:
+        df:
+          type: DataFrame
+          description: "The transformed DataFrame containing label_col."
\ No newline at end of file
diff --git a/metagpt/tools/tool_registry.py b/metagpt/tools/tool_registry.py
new file mode 100644
index 000000000..201c63c71
--- /dev/null
+++ b/metagpt/tools/tool_registry.py
@@ -0,0 +1,162 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+@Time    : 2023/01/12 17:07
+@Author  : garylin2099
+@File    : tool_registry.py
+"""
+import os
+from collections import defaultdict
+import inspect
+import re
+
+import yaml
+
+from metagpt.tools.tool_schema import ToolType, ToolSchema, Tool
+from metagpt.logs import logger
+from metagpt.const import TOOL_SCHEMA_PATH
+
+
+class ToolRegistry:
+    """In-memory index of registered tools and tool types."""
+
+    def __init__(self):
+        self.tools = {}  # {tool_name: Tool}
+        self.tool_types = {}  # {tool_type_name: ToolType}
+        # two-layer k-v, {tool_type_name: {tool_name: {...}, ...}, ...}
+        self.tools_by_types = defaultdict(dict)
+
+    def register_tool_type(self, tool_type: ToolType):
+        """Store tool_type under its name, replacing any previous entry with that name."""
+        self.tool_types[tool_type.name] = tool_type
+
+    def register_tool(
+        self,
+        tool_name,
+        tool_path,
+        schema_path=None,
+        tool_code="",
+        tool_type_name="other",
+        make_schema_if_not_exists=False,
+    ):
+        """Load the YAML schema of a tool and add the tool to the registry.
+
+        Does nothing when tool_name is already registered, or when the schema file is
+        missing and make_schema_if_not_exists is False.
+
+        Args:
+            tool_name: Registry key of the tool and top-level key read from the schema file.
+            tool_path: Path of the code file defining the tool, stored in the schema as "tool_path".
+            schema_path: Schema file to read; defaults to
+                TOOL_SCHEMA_PATH / tool_type_name / f"{tool_name}.yml".
+            tool_code: Source code of the tool, stored on the Tool.
+            tool_type_name: Category under which the tool is filed in tools_by_types.
+            make_schema_if_not_exists: Create the schema file via make_schema when it is missing.
+        """
+        if self.has_tool(tool_name):
+            return
+
+        schema_path = schema_path or TOOL_SCHEMA_PATH / tool_type_name / f"{tool_name}.yml"
+
+        if not os.path.exists(schema_path):
+            if make_schema_if_not_exists:
+                logger.warning(f"no schema found, will make schema at {schema_path}")
+                make_schema(tool_code, schema_path)
+            else:
+                logger.warning(f"no schema found at assumed schema_path {schema_path}, skip registering {tool_name}")
+                return
+
+        with open(schema_path, "r", encoding="utf-8") as f:
+            loaded = yaml.safe_load(f)
+        # A file freshly written by make_schema is empty, so the tool entry may be absent;
+        # indexing the loaded document with [tool_name] would raise in that case.
+        schema = loaded.get(tool_name) if isinstance(loaded, dict) else None
+        if not isinstance(schema, dict):
+            logger.warning(f"no usable schema for {tool_name} in {schema_path}, registering with an empty schema")
+            schema = {}
+        schema["tool_path"] = tool_path  # corresponding code file path of the tool
+        try:
+            ToolSchema(**schema)  # validation
+        except Exception as e:
+            # Validation is advisory: the tool is registered regardless. Report the mismatch
+            # at debug level instead of discarding it silently.
+            logger.debug(
+                f"{tool_name} schema not conforms to required format, but will be used anyway. Mismatch: {e}"
+            )
+        tool = Tool(name=tool_name, path=tool_path, schema=schema, code=tool_code)
+        self.tools[tool_name] = tool
+        self.tools_by_types[tool_type_name][tool_name] = tool
+        logger.info(f"{tool_name} registered")
+
+    def has_tool(self, key):
+        """Return True when a tool is registered under key."""
+        return key in self.tools
+
+    def get_tool(self, key):
+        """Return the tool registered under key, or None when there is none."""
+        return self.tools.get(key)
+
+    def get_tools_by_type(self, key):
+        """Return the {tool_name: Tool} dict of tool type key, or None when the type has no entry."""
+        return self.tools_by_types.get(key)
+
+    def has_tool_type(self, key):
+        """Return True when a tool type is registered under key."""
+        return key in self.tool_types
+
+    def get_tool_type(self, key):
+        """Return the tool type registered under key, or None when there is none."""
+        return self.tool_types.get(key)
+
+    def get_tool_types(self):
+        """Return the {tool_type_name: ToolType} dict itself, not a copy."""
+        return self.tool_types
+
+
+# Registry instance
+TOOL_REGISTRY = ToolRegistry()
+
+
+def register_tool_type(cls):
+    """Class decorator: register an instance of cls, built with its defaults, as a tool type."""
+    TOOL_REGISTRY.register_tool_type(tool_type=cls())
+    return cls
+
+
+def register_tool(tool_name="", tool_type_name="other", schema_path=None):
+    """Decorator factory: register the decorated object as a tool named tool_name or its __name__."""
+
+    # tool_name is rebound inside decorator, so it is passed in as a default value
+    def decorator(cls, tool_name=tool_name):
+        tool_name = tool_name or cls.__name__
+
+        # Get the file path where the function / class is defined and the source code
+        file_path = inspect.getfile(cls)
+        if "metagpt" in file_path:
+            file_path = re.search("metagpt.+", file_path).group(0)  # keep path from first "metagpt" on
+        source_code = inspect.getsource(cls)
+
+        TOOL_REGISTRY.register_tool(
+            tool_name=tool_name,
+            tool_path=file_path,
+            schema_path=schema_path,
+            tool_code=source_code,
+            tool_type_name=tool_type_name,
+        )
+        return cls
+
+    return decorator
+
+
+def make_schema(tool_code, path):
+    """Write an empty placeholder schema file at path and return path.
+
+    tool_code is not used yet; nothing is derived from the code for now.
+    """
+    parent_dir = os.path.dirname(path)
+    if parent_dir:  # os.makedirs("") raises, so skip a path without a directory part
+        os.makedirs(parent_dir, exist_ok=True)  # Create the necessary directories
+    schema = {}  # an empty schema for now
+    with open(path, "w", encoding="utf-8") as f:
+        yaml.dump(schema, f)
+    return path
diff --git a/metagpt/tools/tool_schema.py b/metagpt/tools/tool_schema.py
new file mode 100644
index 000000000..2b90996e5
--- /dev/null
+++ b/metagpt/tools/tool_schema.py
@@ -0,0 +1,31 @@
+from enum import Enum
+
+from pydantic import BaseModel
+
+
+class ToolTypeEnum(Enum):
+    DATA_PREPROCESS = "data_preprocess"
+    FEATURE_ENGINEERING = "feature_engineering"
+    MODEL_TRAIN = "model_train"
+    MODEL_EVALUATE = "model_evaluate"
+    OTHER = "other"
+
+    def __missing__(self, key):
+        return self.OTHER
+
+
+class ToolType(BaseModel):
+    name: str
+    desc: str
+    usage_prompt: str = ""
+
+
+class ToolSchema(BaseModel):
+    name: str
+
+
+class Tool(BaseModel):
+    name: str
+    path: str
+    schema: dict = {}
+    code: str = ""
diff --git a/metagpt/tools/tool_types.py b/metagpt/tools/tool_types.py
new file mode 100644
index 000000000..9104f90b8
--- /dev/null
+++ b/metagpt/tools/tool_types.py
@@ -0,0 +1,43 @@
+from metagpt.prompts.tool_type import (
+    DATA_PREPROCESS_PROMPT,
+    FEATURE_ENGINEERING_PROMPT,
+    MODEL_TRAIN_PROMPT,
+    MODEL_EVALUATE_PROMPT,
+)
+from metagpt.tools.tool_schema import ToolTypeEnum, ToolType
+from metagpt.tools.tool_registry import register_tool_type
+
+
+@register_tool_type
+class DataPreprocess(ToolType):
+    name: str = ToolTypeEnum.DATA_PREPROCESS.value
+    desc: str = "Only for changing value inplace."
+    usage_prompt: str = DATA_PREPROCESS_PROMPT
+
+
+@register_tool_type
+class FeatureEngineer(ToolType):
+    name: str = ToolTypeEnum.FEATURE_ENGINEERING.value
+    desc: str = "Only for creating new columns for input data."
+    usage_prompt: str = FEATURE_ENGINEERING_PROMPT
+
+
+@register_tool_type
+class ModelTrain(ToolType):
+    name: str = ToolTypeEnum.MODEL_TRAIN.value
+    desc: str = "Only for training model."
+    usage_prompt: str = MODEL_TRAIN_PROMPT
+
+
+@register_tool_type
+class ModelEvaluate(ToolType):
+    name: str = ToolTypeEnum.MODEL_EVALUATE.value
+    desc: str = "Only for evaluating model."
+    usage_prompt: str = MODEL_EVALUATE_PROMPT
+
+
+@register_tool_type
+class Other(ToolType):
+    name: str = ToolTypeEnum.OTHER.value
+    desc: str = "Any tools not in the defined categories"
+    usage_prompt: str = ""

From d7ab4d315dd1a58c696733d4912891f1fc7e58d6 Mon Sep 17 00:00:00 2001
From: yzlin <yzlin@fuzhi.ai>
Date: Sat, 13 Jan 2024 12:28:52 +0800
Subject: [PATCH 02/12] rename and integrate sd tool, fix import issue

---
 metagpt/tools/__init__.py                     | 66 ++-----------------
 metagpt/tools/functions/libs/__init__.py      |  7 ++
 .../tools/functions/libs/data_preprocess.py   |  2 +-
 .../functions/libs/feature_engineering.py     |  2 +-
 metagpt/tools/sd_engine.py                    |  3 +
 .../{tool_schema.py => tool_data_type.py}     |  1 +
 metagpt/tools/tool_registry.py                | 29 ++++----
 metagpt/tools/tool_types.py                   | 11 +++-
 8 files changed, 41 insertions(+), 80 deletions(-)
 rename metagpt/tools/{tool_schema.py => tool_data_type.py} (92%)

diff --git a/metagpt/tools/__init__.py b/metagpt/tools/__init__.py
index f743d63c7..4ca46fc89 100644
--- a/metagpt/tools/__init__.py
+++ b/metagpt/tools/__init__.py
@@ -7,6 +7,13 @@
 """
 
 from enum import Enum
+from metagpt.tools import tool_types  # this registers all tool types
+from metagpt.tools.functions import libs  # this registers all tools
+from metagpt.tools.tool_registry import TOOL_REGISTRY
+
+_ = tool_types  # Avoid pre-commit error
+_ = libs  # Avoid pre-commit error
+_ = TOOL_REGISTRY  # Avoid pre-commit error
 
 
 class SearchEngineType(Enum):
@@ -26,62 +33,3 @@ class WebBrowserEngineType(Enum):
     def __missing__(cls, key):
         """Default type conversion"""
         return cls.CUSTOM
-
-
-class ToolType(BaseModel):
-    name: str
-    module: str = ""
-    desc: str
-    usage_prompt: str = ""
-
-
-TOOL_TYPE_MAPPINGS = {
-    "data_preprocess": ToolType(
-        name="data_preprocess",
-        module=str(TOOL_LIBS_PATH / "data_preprocess"),
-        desc="Only for changing value inplace.",
-        usage_prompt=DATA_PREPROCESS_PROMPT,
-    ),
-    "feature_engineering": ToolType(
-        name="feature_engineering",
-        module=str(TOOL_LIBS_PATH / "feature_engineering"),
-        desc="Only for creating new columns for input data.",
-        usage_prompt=FEATURE_ENGINEERING_PROMPT,
-    ),
-    "model_train": ToolType(
-        name="model_train",
-        module="",
-        desc="Only for training model.",
-        usage_prompt=MODEL_TRAIN_PROMPT,
-    ),
-    "model_evaluate": ToolType(
-        name="model_evaluate",
-        module="",
-        desc="Only for evaluating model.",
-        usage_prompt=MODEL_EVALUATE_PROMPT,
-    ),
-    "stable_diffusion": ToolType(
-        name="stable_diffusion",
-        module="metagpt.tools.sd_engine",
-        desc="Related to text2image, image2image using stable diffusion model.",
-        usage_prompt="",
-    ),
-    "scrape_web": ToolType(
-        name="scrape_web",
-        module="metagpt.tools.functions.libs.scrape_web.scrape_web",
-        desc="Scrape data from web page.",
-        usage_prompt="",
-    ),
-    "vision": ToolType(
-        name="vision",
-        module=str(TOOL_LIBS_PATH / "vision"),
-        desc="Only for converting image into webpage code.",
-        usage_prompt=VISION_PROMPT,
-    ),
-    "other": ToolType(
-        name="other",
-        module="",
-        desc="Any tasks that do not fit into the previous categories",
-        usage_prompt="",
-    ),
-}
diff --git a/metagpt/tools/functions/libs/__init__.py b/metagpt/tools/functions/libs/__init__.py
index a0a43f507..f0a61a7d9 100644
--- a/metagpt/tools/functions/libs/__init__.py
+++ b/metagpt/tools/functions/libs/__init__.py
@@ -4,3 +4,10 @@
 # @Author  : lidanyang
 # @File    : __init__.py
 # @Desc    :
+from metagpt.tools.functions.libs import (
+    data_preprocess,
+    feature_engineering,
+)
+
+_ = data_preprocess  # Avoid pre-commit error
+_ = feature_engineering  # Avoid pre-commit error
diff --git a/metagpt/tools/functions/libs/data_preprocess.py b/metagpt/tools/functions/libs/data_preprocess.py
index 59ede3ffc..019ffd34e 100644
--- a/metagpt/tools/functions/libs/data_preprocess.py
+++ b/metagpt/tools/functions/libs/data_preprocess.py
@@ -14,8 +14,8 @@ from sklearn.preprocessing import (
 )
 
 from metagpt.tools.functions.libs.base import MLProcess
+from metagpt.tools.tool_data_type import ToolTypeEnum
 from metagpt.tools.tool_registry import register_tool
-from metagpt.tools.tool_schema import ToolTypeEnum
 
 TOOL_TYPE = ToolTypeEnum.DATA_PREPROCESS.value
 
diff --git a/metagpt/tools/functions/libs/feature_engineering.py b/metagpt/tools/functions/libs/feature_engineering.py
index 8b96cbd07..cd03592a6 100644
--- a/metagpt/tools/functions/libs/feature_engineering.py
+++ b/metagpt/tools/functions/libs/feature_engineering.py
@@ -16,8 +16,8 @@ from sklearn.model_selection import KFold
 from sklearn.preprocessing import KBinsDiscretizer, PolynomialFeatures
 
 from metagpt.tools.functions.libs.base import MLProcess
+from metagpt.tools.tool_data_type import ToolTypeEnum
 from metagpt.tools.tool_registry import register_tool
-from metagpt.tools.tool_schema import ToolTypeEnum
 
 TOOL_TYPE = ToolTypeEnum.FEATURE_ENGINEERING.value
 
diff --git a/metagpt/tools/sd_engine.py b/metagpt/tools/sd_engine.py
index ba61fd496..2e3f36ef8 100644
--- a/metagpt/tools/sd_engine.py
+++ b/metagpt/tools/sd_engine.py
@@ -16,6 +16,8 @@ from PIL import Image, PngImagePlugin
 from metagpt.config import CONFIG
 from metagpt.const import SD_OUTPUT_FILE_REPO
 from metagpt.logs import logger
+from metagpt.tools.tool_data_type import ToolTypeEnum
+from metagpt.tools.tool_registry import register_tool
 
 payload = {
     "prompt": "",
@@ -51,6 +53,7 @@ payload = {
 default_negative_prompt = "(easynegative:0.8),black, dark,Low resolution"
 
 
+@register_tool(tool_type_name=ToolTypeEnum.STABLE_DIFFUSION)
 class SDEngine:
     def __init__(self, sd_url=""):
         # Initialize the SDEngine with configuration
diff --git a/metagpt/tools/tool_schema.py b/metagpt/tools/tool_data_type.py
similarity index 92%
rename from metagpt/tools/tool_schema.py
rename to metagpt/tools/tool_data_type.py
index 2b90996e5..c767fef9b 100644
--- a/metagpt/tools/tool_schema.py
+++ b/metagpt/tools/tool_data_type.py
@@ -8,6 +8,7 @@ class ToolTypeEnum(Enum):
     FEATURE_ENGINEERING = "feature_engineering"
     MODEL_TRAIN = "model_train"
     MODEL_EVALUATE = "model_evaluate"
+    STABLE_DIFFUSION = "stable_diffusion"
     OTHER = "other"
 
     def __missing__(self, key):
diff --git a/metagpt/tools/tool_registry.py b/metagpt/tools/tool_registry.py
index 201c63c71..e6519bba9 100644
--- a/metagpt/tools/tool_registry.py
+++ b/metagpt/tools/tool_registry.py
@@ -5,28 +5,27 @@
 @Author  : garylin2099
 @File    : tool_registry.py
 """
-import os
-from collections import defaultdict
 import inspect
+import os
 import re
+from collections import defaultdict
 
 import yaml
 
-from metagpt.tools.tool_schema import ToolType, ToolSchema, Tool
-from metagpt.logs import logger
 from metagpt.const import TOOL_SCHEMA_PATH
+from metagpt.logs import logger
+from metagpt.tools.tool_data_type import Tool, ToolSchema, ToolType
 
 
 class ToolRegistry:
     def __init__(self):
         self.tools = {}
         self.tool_types = {}
-        self.tools_by_types = defaultdict(
-            dict
-        )  # two-layer k-v, {tool_type_name: {tool_name: {...}, ...}, ...}
+        self.tools_by_types = defaultdict(dict)  # two-layer k-v, {tool_type_name: {tool_name: {...}, ...}, ...}
 
     def register_tool_type(self, tool_type: ToolType):
         self.tool_types[tool_type.name] = tool_type
+        logger.info(f"{tool_type.name} registered")
 
     def register_tool(
         self,
@@ -55,7 +54,7 @@ class ToolRegistry:
         schema["tool_path"] = tool_path  # corresponding code file path of the tool
         try:
             ToolSchema(**schema)  # validation
-        except Exception as e:
+        except Exception:
             pass
             # logger.warning(
             #     f"{tool_name} schema not conforms to required format, but will be used anyway. Mismatch: {e}"
@@ -67,19 +66,19 @@ class ToolRegistry:
 
     def has_tool(self, key):
         return key in self.tools
-    
+
     def get_tool(self, key):
         return self.tools.get(key)
-    
+
     def get_tools_by_type(self, key):
         return self.tools_by_types.get(key)
-    
+
     def has_tool_type(self, key):
         return key in self.tool_types
 
     def get_tool_type(self, key):
         return self.tool_types.get(key)
-    
+
     def get_tool_types(self):
         return self.tool_types
 
@@ -99,7 +98,7 @@ def register_tool(tool_name="", tool_type_name="other", schema_path=None):
 
     def decorator(cls, tool_name=tool_name):
         tool_name = tool_name or cls.__name__
-        
+
         # Get the file path where the function / class is defined and the source code
         file_path = inspect.getfile(cls)
         if "metagpt" in file_path:
@@ -119,9 +118,7 @@ def register_tool(tool_name="", tool_type_name="other", schema_path=None):
 
 
 def make_schema(tool_code, path):
-    os.makedirs(
-        os.path.dirname(path), exist_ok=True
-    )  # Create the necessary directories
+    os.makedirs(os.path.dirname(path), exist_ok=True)  # Create the necessary directories
     schema = {}  # an empty schema for now
     with open(path, "w", encoding="utf-8") as f:
         yaml.dump(schema, f)
diff --git a/metagpt/tools/tool_types.py b/metagpt/tools/tool_types.py
index 9104f90b8..97eb574da 100644
--- a/metagpt/tools/tool_types.py
+++ b/metagpt/tools/tool_types.py
@@ -1,10 +1,10 @@
 from metagpt.prompts.tool_type import (
     DATA_PREPROCESS_PROMPT,
     FEATURE_ENGINEERING_PROMPT,
-    MODEL_TRAIN_PROMPT,
     MODEL_EVALUATE_PROMPT,
+    MODEL_TRAIN_PROMPT,
 )
-from metagpt.tools.tool_schema import ToolTypeEnum, ToolType
+from metagpt.tools.tool_data_type import ToolType, ToolTypeEnum
 from metagpt.tools.tool_registry import register_tool_type
 
 
@@ -36,8 +36,13 @@ class ModelEvaluate(ToolType):
     usage_prompt: str = MODEL_EVALUATE_PROMPT
 
 
+@register_tool_type
+class StableDiffusion(ToolType):
+    name: str = ToolTypeEnum.STABLE_DIFFUSION.value
+    desc: str = "Related to text2image, image2image using stable diffusion model."
+
+
 @register_tool_type
 class Other(ToolType):
     name: str = ToolTypeEnum.OTHER.value
     desc: str = "Any tools not in the defined categories"
-    usage_prompt: str = ""

From c8da839afe8f74a3837c49da9a332b415f7e5972 Mon Sep 17 00:00:00 2001
From: yzlin <yzlin@fuzhi.ai>
Date: Mon, 15 Jan 2024 11:07:29 +0800
Subject: [PATCH 03/12] moving files

---
 .gitignore                                    |   1 +
 docs/FAQ-EN.md                                |   2 +-
 metagpt/const.py                              |   4 +-
 metagpt/prompts/ml_engineer.py                |   4 +-
 metagpt/tools/__init__.py                     |   2 +-
 metagpt/tools/functions/__init__.py           |   6 -
 metagpt/tools/functions/libs/base.py          |  16 -
 metagpt/tools/functions/libs/udf/__init__.py  | 126 ----
 .../functions/schemas/data_preprocess.yml     | 306 ----------
 .../functions/schemas/feature_engineering.yml | 548 ------------------
 .../tools/{functions => }/libs/__init__.py    |   2 +-
 .../{functions => }/libs/data_preprocess.py   |  13 +-
 .../libs/feature_engineering.py               |   2 +-
 metagpt/tools/{ => libs}/sd_engine.py         |   2 +-
 .../tools/{functions => }/schemas/__init__.py |   0
 .../data_preprocess/FillMissingValue.yml      |   0
 .../schemas/data_preprocess/LabelEncode.yml   |   0
 .../schemas/data_preprocess/MaxAbsScale.yml   |   0
 .../schemas/data_preprocess/MinMaxScale.yml   |   0
 .../schemas/data_preprocess/OneHotEncode.yml  |   0
 .../schemas/data_preprocess/StandardScale.yml |   0
 .../schemas/feature_engineering/CatCount.yml  |   0
 .../schemas/feature_engineering/CatCross.yml  |   0
 .../feature_engineering/GeneralSelection.yml  |   0
 .../schemas/feature_engineering/GroupStat.yml |   0
 .../KFoldTargetMeanEncoder.yml                |   0
 .../PolynomialExpansion.yml                   |   0
 .../schemas/feature_engineering/SplitBins.yml |   0
 .../feature_engineering/TargetMeanEncoder.yml |   0
 .../TreeBasedSelection.yml                    |   0
 .../VarianceBasedSelection.yml                |   0
 .../stable_diffusion/SDEngine.yml}            |   0
 tests/metagpt/tools/functions/__init__.py     |   6 -
 .../tools/{functions => }/libs/__init__.py    |   0
 .../libs/test_data_preprocess.py              |   2 +-
 .../libs/test_feature_engineering.py          |   3 +-
 .../tools/{functions => libs}/test_sd.py      |   2 +-
 .../tools/{functions => libs}/test_udf.py     |   2 +-
 38 files changed, 27 insertions(+), 1022 deletions(-)
 delete mode 100644 metagpt/tools/functions/__init__.py
 delete mode 100644 metagpt/tools/functions/libs/base.py
 delete mode 100644 metagpt/tools/functions/libs/udf/__init__.py
 delete mode 100644 metagpt/tools/functions/schemas/data_preprocess.yml
 delete mode 100644 metagpt/tools/functions/schemas/feature_engineering.yml
 rename metagpt/tools/{functions => }/libs/__init__.py (86%)
 rename metagpt/tools/{functions => }/libs/data_preprocess.py (96%)
 rename metagpt/tools/{functions => }/libs/feature_engineering.py (99%)
 rename metagpt/tools/{ => libs}/sd_engine.py (98%)
 rename metagpt/tools/{functions => }/schemas/__init__.py (100%)
 rename metagpt/tools/{functions => }/schemas/data_preprocess/FillMissingValue.yml (100%)
 rename metagpt/tools/{functions => }/schemas/data_preprocess/LabelEncode.yml (100%)
 rename metagpt/tools/{functions => }/schemas/data_preprocess/MaxAbsScale.yml (100%)
 rename metagpt/tools/{functions => }/schemas/data_preprocess/MinMaxScale.yml (100%)
 rename metagpt/tools/{functions => }/schemas/data_preprocess/OneHotEncode.yml (100%)
 rename metagpt/tools/{functions => }/schemas/data_preprocess/StandardScale.yml (100%)
 rename metagpt/tools/{functions => }/schemas/feature_engineering/CatCount.yml (100%)
 rename metagpt/tools/{functions => }/schemas/feature_engineering/CatCross.yml (100%)
 rename metagpt/tools/{functions => }/schemas/feature_engineering/GeneralSelection.yml (100%)
 rename metagpt/tools/{functions => }/schemas/feature_engineering/GroupStat.yml (100%)
 rename metagpt/tools/{functions => }/schemas/feature_engineering/KFoldTargetMeanEncoder.yml (100%)
 rename metagpt/tools/{functions => }/schemas/feature_engineering/PolynomialExpansion.yml (100%)
 rename metagpt/tools/{functions => }/schemas/feature_engineering/SplitBins.yml (100%)
 rename metagpt/tools/{functions => }/schemas/feature_engineering/TargetMeanEncoder.yml (100%)
 rename metagpt/tools/{functions => }/schemas/feature_engineering/TreeBasedSelection.yml (100%)
 rename metagpt/tools/{functions => }/schemas/feature_engineering/VarianceBasedSelection.yml (100%)
 rename metagpt/tools/{functions/schemas/stable_diffusion.yml => schemas/stable_diffusion/SDEngine.yml} (100%)
 delete mode 100644 tests/metagpt/tools/functions/__init__.py
 rename tests/metagpt/tools/{functions => }/libs/__init__.py (100%)
 rename tests/metagpt/tools/{functions => }/libs/test_data_preprocess.py (97%)
 rename tests/metagpt/tools/{functions => }/libs/test_feature_engineering.py (97%)
 rename tests/metagpt/tools/{functions => libs}/test_sd.py (93%)
 rename tests/metagpt/tools/{functions => libs}/test_udf.py (95%)

diff --git a/.gitignore b/.gitignore
index 87c7b3120..a69b3b1c2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -173,6 +173,7 @@ tests/metagpt/utils/file_repo_git
 *.png
 htmlcov
 htmlcov.*
+cov.xml
 *.dot
 *.pkl
 *-structure.csv
diff --git a/docs/FAQ-EN.md b/docs/FAQ-EN.md
index d4a9f6097..145d27be9 100644
--- a/docs/FAQ-EN.md
+++ b/docs/FAQ-EN.md
@@ -130,7 +130,7 @@
         1.  HTML Layout： Outputs the HTML code for the page.
         1.  CSS Styles (styles.css)： Outputs the CSS code for the page.
 
-    1.  Currently, the SD skill is a tool invoked by UIDesign. It instantiates the SDEngine, with specific code found in metagpt/tools/sd_engine.
+    1.  Currently, the SD skill is a tool invoked by UIDesign. It instantiates the SDEngine, with specific code found in metagpt/tools/libs/sd_engine.py.
 
     1.  Configuration instructions for SD Skills: The SD interface is currently deployed based on *https://github.com/AUTOMATIC1111/stable-diffusion-webui* **For environmental configurations and model downloads, please refer to the aforementioned GitHub repository. To initiate the SD service that supports API calls, run the command specified in cmd with the parameter nowebui, i.e.,
 
diff --git a/metagpt/const.py b/metagpt/const.py
index a57464a19..7a19e81d0 100644
--- a/metagpt/const.py
+++ b/metagpt/const.py
@@ -70,8 +70,8 @@ TMP = METAGPT_ROOT / "tmp"
 SOURCE_ROOT = METAGPT_ROOT / "metagpt"
 PROMPT_PATH = SOURCE_ROOT / "prompts"
 SKILL_DIRECTORY = SOURCE_ROOT / "skills"
-TOOL_SCHEMA_PATH = METAGPT_ROOT / "metagpt/tools/functions/schemas"
-TOOL_LIBS_PATH = METAGPT_ROOT / "metagpt/tools/functions/libs"
+TOOL_SCHEMA_PATH = METAGPT_ROOT / "metagpt/tools/schemas"
+TOOL_LIBS_PATH = METAGPT_ROOT / "metagpt/tools/libs"
 
 
 # REAL CONSTS
diff --git a/metagpt/prompts/ml_engineer.py b/metagpt/prompts/ml_engineer.py
index 31d754a9e..ff29d5ed4 100644
--- a/metagpt/prompts/ml_engineer.py
+++ b/metagpt/prompts/ml_engineer.py
@@ -15,7 +15,7 @@ Keep dataset column information updated before model train.
 # Task
 Update and print the dataset's column information only if the train or test data has changed. Use the following code:
 ```python
-from metagpt.tools.functions.libs.data_preprocess import get_column_info
+from metagpt.tools.libs.data_preprocess import get_column_info
 
 column_info = get_column_info(df)
 print("column_info")
@@ -248,7 +248,7 @@ when current task is "do data preprocess, like fill missing value, handle outlie
 ```python
 # Step 1: fill missing value
 # Tools used: ['FillMissingValue']
-from metagpt.tools.functions.libs.data_preprocess import FillMissingValue
+from metagpt.tools.libs.data_preprocess import FillMissingValue
 
 train_processed = train.copy()
 test_processed = test.copy()
diff --git a/metagpt/tools/__init__.py b/metagpt/tools/__init__.py
index 4ca46fc89..23b51533d 100644
--- a/metagpt/tools/__init__.py
+++ b/metagpt/tools/__init__.py
@@ -8,7 +8,7 @@
 
 from enum import Enum
 from metagpt.tools import tool_types  # this registers all tool types
-from metagpt.tools.functions import libs  # this registers all tools
+from metagpt.tools import libs  # this registers all tools
 from metagpt.tools.tool_registry import TOOL_REGISTRY
 
 _ = tool_types  # Avoid pre-commit error
diff --git a/metagpt/tools/functions/__init__.py b/metagpt/tools/functions/__init__.py
deleted file mode 100644
index a0a43f507..000000000
--- a/metagpt/tools/functions/__init__.py
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-# @Time    : 2023/11/16 16:32
-# @Author  : lidanyang
-# @File    : __init__.py
-# @Desc    :
diff --git a/metagpt/tools/functions/libs/base.py b/metagpt/tools/functions/libs/base.py
deleted file mode 100644
index c39adc66b..000000000
--- a/metagpt/tools/functions/libs/base.py
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-# @Time    : 2023/12/10 20:12
-# @Author  : lidanyang
-# @File    : base
-# @Desc    :
-class MLProcess(object):
-    def fit(self, df):
-        raise NotImplementedError
-
-    def transform(self, df):
-        raise NotImplementedError
-
-    def fit_transform(self, df):
-        self.fit(df)
-        return self.transform(df)
diff --git a/metagpt/tools/functions/libs/udf/__init__.py b/metagpt/tools/functions/libs/udf/__init__.py
deleted file mode 100644
index 6644565d7..000000000
--- a/metagpt/tools/functions/libs/udf/__init__.py
+++ /dev/null
@@ -1,126 +0,0 @@
-import ast
-import os
-import re
-import yaml
-import inspect
-import importlib
-from pathlib import Path
-from typing import List
-from metagpt.logs import logger
-
-
-def extract_function_signatures(file_path):
-    with open(file_path, "r", encoding="utf-8") as file:
-        source_code = file.read()
-
-    tree = ast.parse(source_code)
-    function_signatures = []
-    function_returns = []
-    for node in ast.walk(tree):
-        if isinstance(node, ast.FunctionDef):
-            # 只提取用户自定义函数，排除内置函数
-            if not (node.name.startswith("__") and node.name.endswith("__")):
-                # 获取函数名
-                function_name = node.name
-                # 获取参数列表
-                args = [arg.arg for arg in node.args.args]
-                # 获取函数签名
-                function_signature = f"{function_name}({', '.join(args)})"
-                # 导入函数
-                module_name = Path(file_path).parts[-1][: -len(Path(file_path).suffix)]
-                module = importlib.import_module(f"metagpt.tools.functions.libs.udf.{module_name}")
-                # 将函数导入到当前命名空间
-                globals().update({function_name: getattr(module, function_name)})
-                # 获取函数注释和函数路径
-                function_schema = {
-                    "udf_name": function_signature,
-                    "udf_path": f"from metagpt.tools.functions.libs.udf.{module_name} import {function_name}",
-                    "udf_doc": inspect.getdoc(getattr(module, function_name)),
-                }
-                function_signatures.append(function_schema)
-                # 获取函数返回变量名
-                source_lines, _ = inspect.getsourcelines(getattr(module, function_name))
-                for line in source_lines:
-                    if line.strip().startswith("return "):
-                        function_returns.append(
-                            {
-                                "udf_name": function_name,
-                                "udf_returns": [var.strip() for var in line.strip()[len("return ") :].split(",")],
-                            }
-                        )
-                        break
-
-                # 没有返回值的函数
-                if not function_returns or function_returns[-1]["udf_name"] != function_name:
-                    function_returns.append({"udf_name": function_name, "udf_returns": [None]})
-    return function_signatures, function_returns
-
-
-def get_function_signatures_in_folder(folder_path):
-    python_files = [f for f in os.listdir(folder_path) if f.endswith(".py") and f != "__init__.py"]
-    all_function_signatures = []
-    all_function_returns = []
-
-    for file_name in python_files:
-        file_path = os.path.join(folder_path, file_name)
-        function_signatures, function_returns = extract_function_signatures(file_path)
-        all_function_signatures.extend(function_signatures)
-        all_function_returns.extend(function_returns)
-    return all_function_signatures, all_function_returns
-
-
-# Create Tools Yaml Style Schema
-def docstring_to_yaml(docstring: str, return_vars: List[str] = None):
-    logger.debug(f"\n\nFunction Docstring: \n{'-'*60}\n {docstring} \n\nFunction Returns: \n{'-'*60}\n{return_vars}\n")
-    if docstring is None:
-        return {}
-    # 匹配简介部分
-    description_match = re.search(r"^(.*?)(?:Args:|Returns:|Raises:|$)", docstring, re.DOTALL)
-    description = description_match.group(1).strip() if description_match else ""
-
-    # 匹配Args部分
-    args_match = re.search(r"Args:\s*(.*?)(?:Returns:|Raises:|$)", docstring, re.DOTALL)
-    _args = args_match.group(1).strip() if args_match else ""
-    variable_pattern = re.compile(r"(\w+)\s*\((.*?)\):\s*(.*)")
-    params = variable_pattern.findall(_args)
-    if not params:
-        params = ((None, None, None),)
-    # 匹配Returns部分
-    returns_match = re.search(r"Returns:\s*(.*?)(?:Raises:|$)", docstring, re.DOTALL)
-    returns = returns_match.group(1).strip() if returns_match else ""
-    return_pattern = re.compile(r"^(.*)\s*:\s*(.*)$")
-    # 添加返回值变量名
-    return_vars = return_vars if isinstance(return_vars, list) else [return_vars]
-    returns = [(r, *r_desc) for r_desc, r in zip(return_pattern.findall(returns), return_vars)]
-    # 构建YAML字典
-    yaml_data = {
-        "description": description.strip(".").strip(),
-        "parameters": {
-            "properties": {
-                param[0]: {"type": param[1], "description": param[2]} for param in params if param[0] is not None
-            },
-            "required": [param[0] for param in params if param[0] is not None],
-        },
-        "returns": {ret[0]: {"type": ret[1], "description": ret[2]} for ret in returns},
-    }
-    return yaml_data
-
-
-def extract_function_schema_yaml_in_folder(folder_path: str):
-    function_signatures, function_returns = get_function_signatures_in_folder(folder_path)
-    function_schema_yaml_data = {}
-    for func_docstring, func_returns in zip(function_signatures, function_returns):
-        if func_docstring["udf_doc"]:
-            fun_yaml_data = docstring_to_yaml(func_docstring["udf_doc"], func_returns["udf_returns"])
-            fun_yaml_data.update({"type": "function"})
-            function_schema_yaml_data.update({func_returns["udf_name"]: fun_yaml_data})
-    return yaml.dump(function_schema_yaml_data, default_flow_style=False)
-
-
-folder_path = str(Path(__file__).parent.absolute())
-function_signatures, function_returns = get_function_signatures_in_folder(folder_path)
-
-UDFS = [func for func in function_signatures]
-
-UDFS_YAML_STR: str = extract_function_schema_yaml_in_folder(folder_path)
-UDFS_YAML: dict = yaml.load(UDFS_YAML_STR, Loader=yaml.FullLoader)
diff --git a/metagpt/tools/functions/schemas/data_preprocess.yml b/metagpt/tools/functions/schemas/data_preprocess.yml
deleted file mode 100644
index 4de697abd..000000000
--- a/metagpt/tools/functions/schemas/data_preprocess.yml
+++ /dev/null
@@ -1,306 +0,0 @@
-FillMissingValue:
-  type: class
-  description: "Completing missing values with simple strategies"
-  methods:
-    __init__:
-      description: "Initialize self."
-      parameters:
-        properties:
-          features:
-            type: list
-            description: "columns to be processed"
-          strategy:
-            type: str
-            description: "the imputation strategy, notice mean/median can only be used for numeric features"
-            default: mean
-            enum:
-              - mean
-              - median
-              - most_frequent
-              - constant
-          fill_value:
-            type: int
-            description: "fill_value is used to replace all occurrences of missing_values"
-            default: null
-        required:
-          - features
-    fit:
-      description: "Fit the FillMissingValue model."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-    transform:
-      description: "Transform the input DataFrame with the fitted model."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-      returns:
-        df:
-          type: DataFrame
-          description: "The transformed DataFrame."
-    fit_transform:
-      description: "Fit and transform the input DataFrame."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-      returns:
-        df:
-          type: DataFrame
-          description: "The transformed DataFrame."
-
-MinMaxScale:
-  type: class
-  description: "Transform features by scaling each feature to a range, witch is (0, 1)"
-  methods:
-    __init__:
-      description: "Initialize self."
-      parameters:
-        properties:
-          features:
-            type: list
-            description: "columns to be processed"
-        required:
-          - features
-    fit:
-      description: "Fit the MinMaxScale model."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-    transform:
-      description: "Transform the input DataFrame with the fitted model."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-      returns:
-        df:
-          type: DataFrame
-          description: "The transformed DataFrame."
-    fit_transform:
-      description: "Fit and transform the input DataFrame."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-      returns:
-        df:
-          type: DataFrame
-          description: "The transformed DataFrame."
-
-StandardScale:
-  type: class
-  description: "Standardize features by removing the mean and scaling to unit variance"
-  methods:
-    __init__:
-      description: "Initialize self."
-      parameters:
-        properties:
-          features:
-            type: list
-            description: "columns to be processed"
-        required:
-          - features
-    fit:
-      description: "Fit the StandardScale model."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-    transform:
-      description: "Transform the input DataFrame with the fitted model."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-      returns:
-        df:
-          type: DataFrame
-          description: "The transformed DataFrame."
-    fit_transform:
-      description: "Fit and transform the input DataFrame."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-      returns:
-        df:
-          type: DataFrame
-          description: "The transformed DataFrame."
-
-MaxAbsScale:
-  type: class
-  description: "cale each feature by its maximum absolute value"
-  methods:
-    __init__:
-      description: "Initialize self."
-      parameters:
-        properties:
-          features:
-            type: list
-            description: "columns to be processed"
-        required:
-          - features
-    fit:
-      description: "Fit the MaxAbsScale model."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-    transform:
-      description: "Transform the input DataFrame with the fitted model."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-      returns:
-        df:
-          type: DataFrame
-          description: "The transformed DataFrame."
-    fit_transform:
-      description: "Fit and transform the input DataFrame."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-      returns:
-        df:
-          type: DataFrame
-          description: "The transformed DataFrame."
-
-LabelEncode:
-  type: class
-  description: "Apply label encoding to specified categorical columns in-place."
-  methods:
-    __init__:
-      description: "Initialize self."
-      parameters:
-        properties:
-          features:
-            type: list
-            description: "Categorical columns to be label encoded"
-        required:
-          - features
-    fit:
-      description: "Fit the LabelEncode model."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-    transform:
-      description: "Transform the input DataFrame with the fitted model."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-      returns:
-        df:
-          type: DataFrame
-          description: "The transformed DataFrame."
-    fit_transform:
-      description: "Fit and transform the input DataFrame."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-      returns:
-        df:
-          type: DataFrame
-          description: "The transformed DataFrame."
-
-OneHotEncode:
-  type: class
-  description: "Apply one-hot encoding to specified categorical columns, the original columns will be dropped."
-  methods:
-    __init__:
-      description: "Initialize self."
-      parameters:
-        properties:
-          features:
-            type: list
-            description: "Categorical columns to be one-hot encoded and dropped"
-        required:
-          - features
-    fit:
-      description: "Fit the OneHotEncoding model."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-    transform:
-      description: "Transform the input DataFrame with the fitted model."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-      returns:
-        df:
-          type: DataFrame
-          description: "The transformed DataFrame."
-    fit_transform:
-      description: "Fit and transform the input DataFrame."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-      returns:
-        df:
-          type: DataFrame
-          description: "The transformed DataFrame."
\ No newline at end of file
diff --git a/metagpt/tools/functions/schemas/feature_engineering.yml b/metagpt/tools/functions/schemas/feature_engineering.yml
deleted file mode 100644
index 62e6ad5b3..000000000
--- a/metagpt/tools/functions/schemas/feature_engineering.yml
+++ /dev/null
@@ -1,548 +0,0 @@
-PolynomialExpansion:
-  type: class
-  description: "Add polynomial and interaction features from selected numeric columns to input DataFrame."
-  methods:
-    __init__:
-      description: "Initialize self."
-      parameters:
-        properties:
-          cols:
-            type: list
-            description: "Columns for polynomial expansion."
-          label_col:
-            type: str
-            description: "Label column name."
-          degree:
-            type: int
-            description: "The degree of the polynomial features."
-            default: 2
-        required:
-          - cols
-          - label_col
-    fit:
-      description: "Fit the PolynomialExpansion model."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-    transform:
-      description: "Transform the input DataFrame with the fitted model."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-      returns:
-        df:
-          type: DataFrame
-          description: "The transformed DataFrame without duplicated columns."
-    fit_transform:
-      description: "Fit and transform the input DataFrame."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame without duplicated columns."
-        required:
-          - df
-      returns:
-        df:
-          type: DataFrame
-          description: "The transformed DataFrame."
-
-CatCount:
-  type: class
-  description: "Add value counts of a categorical column as new feature."
-  methods:
-    __init__:
-      description: "Initialize self."
-      parameters:
-        properties:
-          col:
-            type: str
-            description: "Column for value counts."
-        required:
-          - col
-    fit:
-      description: "Fit the CatCount model."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-    transform:
-      description: "Transform the input DataFrame with the fitted model."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-      returns:
-        df:
-          type: DataFrame
-          description: "The transformed DataFrame."
-    fit_transform:
-      description: "Fit and transform the input DataFrame."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-      returns:
-        df:
-          type: DataFrame
-          description: "The transformed DataFrame."
-
-TargetMeanEncoder:
-  type: class
-  description: "Encodes a categorical column by the mean of the label column, and adds the result as a new feature."
-  methods:
-    __init__:
-      description: "Initialize self."
-      parameters:
-        properties:
-          col:
-            type: str
-            description: "Column to be mean encoded."
-          label:
-            type: str
-            description: "Predicted label column."
-        required:
-          - col
-          - label
-    fit:
-      description: "Fit the TargetMeanEncoder model."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-    transform:
-      description: "Transform the input DataFrame with the fitted model."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-      returns:
-        df:
-          type: DataFrame
-          description: "The transformed DataFrame."
-    fit_transform:
-      description: "Fit and transform the input DataFrame."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-      returns:
-        df:
-          type: DataFrame
-          description: "The transformed DataFrame."
-
-KFoldTargetMeanEncoder:
-  type: class
-  description: "Adds a new feature to the DataFrame by k-fold mean encoding of a categorical column using the label column."
-  methods:
-    __init__:
-      description: "Initialize self."
-      parameters:
-        properties:
-          col:
-            type: str
-            description: "Column to be k-fold mean encoded."
-          label:
-            type: str
-            description: "Predicted label column."
-          n_splits:
-            type: int
-            description: "Number of splits for K-fold."
-            default: 5
-          random_state:
-            type: int
-            description: "Random seed."
-            default: 2021
-        required:
-          - col
-          - label
-    fit:
-      description: "Fit the KFoldTargetMeanEncoder model."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-    transform:
-      description: "Transform the input DataFrame with the fitted model."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-      returns:
-        df:
-          type: DataFrame
-          description: "The transformed DataFrame."
-    fit_transform:
-      description: "Fit and transform the input DataFrame."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-      returns:
-        df:
-          type: DataFrame
-          description: "The transformed DataFrame."
-
-CatCross:
-  type: class
-  description: "Add pairwise crossed features and convert them to numerical features."
-  methods:
-    __init__:
-      description: "Initialize self."
-      parameters:
-        properties:
-          cols:
-            type: list
-            description: "Columns to be pairwise crossed, at least 2 columns."
-          max_cat_num:
-            type: int
-            description: "Maximum unique categories per crossed feature."
-            default: 100
-      required:
-        - cols
-    fit:
-      description: "Fit the CatCross model."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-    transform:
-      description: "Transform the input DataFrame with the fitted model."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-      returns:
-        df:
-          type: DataFrame
-          description: "The transformed DataFrame."
-    fit_transform:
-      description: "Fit and transform the input DataFrame."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-      returns:
-        df:
-          type: DataFrame
-          description: "The transformed DataFrame."
-
-GroupStat:
-  type: class
-  description: "Aggregate specified column in a DataFrame grouped by another column, adding new features named '<agg_col>_<agg_func>_by_<group_col>'."
-  methods:
-    __init__:
-      description: "Initialize self."
-      parameters:
-        properties:
-          group_col:
-            type: str
-            description: "Column used for grouping."
-          agg_col:
-            type: str
-            description: "Column on which aggregation is performed."
-          agg_funcs:
-            type: list
-            description: >-
-              List of aggregation functions to apply, such as ['mean', 'std'].
-              Each function must be supported by pandas.
-        required:
-          - group_col
-          - agg_col
-          - agg_funcs
-    fit:
-      description: "Fit the GroupStat model."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-    transform:
-      description: "Transform the input DataFrame with the fitted model."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-      returns:
-        df:
-          type: DataFrame
-          description: "The transformed DataFrame."
-    fit_transform:
-      description: "Fit and transform the input DataFrame."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-      returns:
-        df:
-          type: DataFrame
-          description: "The transformed DataFrame."
-
-SplitBins:
-  type: class
-  description: "Inplace binning of continuous data into intervals, returning integer-encoded bin identifiers directly."
-  methods:
-    __init__:
-      description: "Initialize self."
-      parameters:
-        properties:
-          cols:
-            type: list
-            description: "Columns to be binned inplace."
-          strategy:
-            type: str
-            description: "Strategy used to define the widths of the bins."
-            default: quantile
-            enum:
-              - quantile
-              - uniform
-              - kmeans
-        required:
-          - cols
-    fit:
-      description: "Fit the SplitBins model."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-    transform:
-      description: "Transform the input DataFrame with the fitted model."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-      returns:
-        df:
-          type: DataFrame
-          description: "The transformed DataFrame."
-    fit_transform:
-      description: "Fit and transform the input DataFrame."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-      returns:
-        df:
-          type: DataFrame
-          description: "The transformed DataFrame."
-
-GeneralSelection:
-  type: class
-  description: "Drop all nan feats and feats with only one unique value."
-  methods:
-    __init__:
-      description: "Initialize self."
-      parameters:
-        properties:
-          label_col:
-            type: str
-            description: "Label column name."
-        required:
-          - label_col
-    fit:
-      description: "Fit the GeneralSelection model."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-    transform:
-      description: "Transform the input DataFrame with the fitted model."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-      returns:
-        df:
-          type: DataFrame
-          description: "The transformed DataFrame."
-    fit_transform:
-      description: "Fit and transform the input DataFrame."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-      returns:
-        df:
-          type: DataFrame
-          description: "The transformed DataFrame."
-
-
-TreeBasedSelection:
-  type: class
-  description: "Select features based on tree-based model and remove features with low importance."
-  methods:
-    __init__:
-      description: "Initialize self."
-      parameters:
-        properties:
-          label_col:
-            type: str
-            description: "Label column name."
-          task_type:
-            type: str
-            description: "Task type, 'cls' for classification, 'mcls' for multi-class classification, 'reg' for regression."
-            enum:
-              - cls
-              - mcls
-              - reg
-        required:
-          - label_col
-          - task_type
-    fit:
-      description: "Fit the TreeBasedSelection model."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-    transform:
-      description: "Transform the input DataFrame with the fitted model."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-      returns:
-        df:
-          type: DataFrame
-          description: "The transformed DataFrame contain label_col."
-    fit_transform:
-      description: "Fit and transform the input DataFrame."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-      returns:
-        df:
-          type: DataFrame
-          description: "The transformed DataFrame contain label_col."
-
-VarianceBasedSelection:
-  type: class
-  description: "Select features based on variance and remove features with low variance."
-  methods:
-    __init__:
-      description: "Initialize self."
-      parameters:
-        properties:
-          label_col:
-            type: str
-            description: "Label column name."
-          threshold:
-            type: float
-            description: "Threshold for variance."
-            default: 0.0
-        required:
-          - label_col
-    fit:
-      description: "Fit the VarianceBasedSelection model."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-    transform:
-      description: "Transform the input DataFrame with the fitted model."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-      returns:
-        df:
-          type: DataFrame
-          description: "The transformed DataFrame contain label_col."
-    fit_transform:
-      description: "Fit and transform the input DataFrame."
-      parameters:
-        properties:
-          df:
-            type: DataFrame
-            description: "The input DataFrame."
-        required:
-          - df
-      returns:
-        df:
-          type: DataFrame
-          description: "The transformed DataFrame contain label_col."
\ No newline at end of file
diff --git a/metagpt/tools/functions/libs/__init__.py b/metagpt/tools/libs/__init__.py
similarity index 86%
rename from metagpt/tools/functions/libs/__init__.py
rename to metagpt/tools/libs/__init__.py
index f0a61a7d9..3d74674aa 100644
--- a/metagpt/tools/functions/libs/__init__.py
+++ b/metagpt/tools/libs/__init__.py
@@ -4,7 +4,7 @@
 # @Author  : lidanyang
 # @File    : __init__.py
 # @Desc    :
-from metagpt.tools.functions.libs import (
+from metagpt.tools.libs import (
     data_preprocess,
     feature_engineering,
 )
diff --git a/metagpt/tools/functions/libs/data_preprocess.py b/metagpt/tools/libs/data_preprocess.py
similarity index 96%
rename from metagpt/tools/functions/libs/data_preprocess.py
rename to metagpt/tools/libs/data_preprocess.py
index 019ffd34e..7cc44263d 100644
--- a/metagpt/tools/functions/libs/data_preprocess.py
+++ b/metagpt/tools/libs/data_preprocess.py
@@ -13,13 +13,24 @@ from sklearn.preprocessing import (
     StandardScaler,
 )
 
-from metagpt.tools.functions.libs.base import MLProcess
 from metagpt.tools.tool_data_type import ToolTypeEnum
 from metagpt.tools.tool_registry import register_tool
 
 TOOL_TYPE = ToolTypeEnum.DATA_PREPROCESS.value
 
 
+class MLProcess(object):
+    def fit(self, df):
+        raise NotImplementedError
+
+    def transform(self, df):
+        raise NotImplementedError
+
+    def fit_transform(self, df):
+        self.fit(df)
+        return self.transform(df)
+
+
 @register_tool(tool_type_name=TOOL_TYPE)
 class FillMissingValue(MLProcess):
     def __init__(
diff --git a/metagpt/tools/functions/libs/feature_engineering.py b/metagpt/tools/libs/feature_engineering.py
similarity index 99%
rename from metagpt/tools/functions/libs/feature_engineering.py
rename to metagpt/tools/libs/feature_engineering.py
index cd03592a6..ed5c1be72 100644
--- a/metagpt/tools/functions/libs/feature_engineering.py
+++ b/metagpt/tools/libs/feature_engineering.py
@@ -15,7 +15,7 @@ from sklearn.feature_selection import VarianceThreshold
 from sklearn.model_selection import KFold
 from sklearn.preprocessing import KBinsDiscretizer, PolynomialFeatures
 
-from metagpt.tools.functions.libs.base import MLProcess
+from metagpt.tools.libs.data_preprocess import MLProcess
 from metagpt.tools.tool_data_type import ToolTypeEnum
 from metagpt.tools.tool_registry import register_tool
 
diff --git a/metagpt/tools/sd_engine.py b/metagpt/tools/libs/sd_engine.py
similarity index 98%
rename from metagpt/tools/sd_engine.py
rename to metagpt/tools/libs/sd_engine.py
index 2e3f36ef8..ad63c2505 100644
--- a/metagpt/tools/sd_engine.py
+++ b/metagpt/tools/libs/sd_engine.py
@@ -53,7 +53,7 @@ payload = {
 default_negative_prompt = "(easynegative:0.8),black, dark,Low resolution"
 
 
-@register_tool(tool_type_name=ToolTypeEnum.STABLE_DIFFUSION)
+@register_tool(tool_type_name=ToolTypeEnum.STABLE_DIFFUSION.value)
 class SDEngine:
     def __init__(self, sd_url=""):
         # Initialize the SDEngine with configuration
diff --git a/metagpt/tools/functions/schemas/__init__.py b/metagpt/tools/schemas/__init__.py
similarity index 100%
rename from metagpt/tools/functions/schemas/__init__.py
rename to metagpt/tools/schemas/__init__.py
diff --git a/metagpt/tools/functions/schemas/data_preprocess/FillMissingValue.yml b/metagpt/tools/schemas/data_preprocess/FillMissingValue.yml
similarity index 100%
rename from metagpt/tools/functions/schemas/data_preprocess/FillMissingValue.yml
rename to metagpt/tools/schemas/data_preprocess/FillMissingValue.yml
diff --git a/metagpt/tools/functions/schemas/data_preprocess/LabelEncode.yml b/metagpt/tools/schemas/data_preprocess/LabelEncode.yml
similarity index 100%
rename from metagpt/tools/functions/schemas/data_preprocess/LabelEncode.yml
rename to metagpt/tools/schemas/data_preprocess/LabelEncode.yml
diff --git a/metagpt/tools/functions/schemas/data_preprocess/MaxAbsScale.yml b/metagpt/tools/schemas/data_preprocess/MaxAbsScale.yml
similarity index 100%
rename from metagpt/tools/functions/schemas/data_preprocess/MaxAbsScale.yml
rename to metagpt/tools/schemas/data_preprocess/MaxAbsScale.yml
diff --git a/metagpt/tools/functions/schemas/data_preprocess/MinMaxScale.yml b/metagpt/tools/schemas/data_preprocess/MinMaxScale.yml
similarity index 100%
rename from metagpt/tools/functions/schemas/data_preprocess/MinMaxScale.yml
rename to metagpt/tools/schemas/data_preprocess/MinMaxScale.yml
diff --git a/metagpt/tools/functions/schemas/data_preprocess/OneHotEncode.yml b/metagpt/tools/schemas/data_preprocess/OneHotEncode.yml
similarity index 100%
rename from metagpt/tools/functions/schemas/data_preprocess/OneHotEncode.yml
rename to metagpt/tools/schemas/data_preprocess/OneHotEncode.yml
diff --git a/metagpt/tools/functions/schemas/data_preprocess/StandardScale.yml b/metagpt/tools/schemas/data_preprocess/StandardScale.yml
similarity index 100%
rename from metagpt/tools/functions/schemas/data_preprocess/StandardScale.yml
rename to metagpt/tools/schemas/data_preprocess/StandardScale.yml
diff --git a/metagpt/tools/functions/schemas/feature_engineering/CatCount.yml b/metagpt/tools/schemas/feature_engineering/CatCount.yml
similarity index 100%
rename from metagpt/tools/functions/schemas/feature_engineering/CatCount.yml
rename to metagpt/tools/schemas/feature_engineering/CatCount.yml
diff --git a/metagpt/tools/functions/schemas/feature_engineering/CatCross.yml b/metagpt/tools/schemas/feature_engineering/CatCross.yml
similarity index 100%
rename from metagpt/tools/functions/schemas/feature_engineering/CatCross.yml
rename to metagpt/tools/schemas/feature_engineering/CatCross.yml
diff --git a/metagpt/tools/functions/schemas/feature_engineering/GeneralSelection.yml b/metagpt/tools/schemas/feature_engineering/GeneralSelection.yml
similarity index 100%
rename from metagpt/tools/functions/schemas/feature_engineering/GeneralSelection.yml
rename to metagpt/tools/schemas/feature_engineering/GeneralSelection.yml
diff --git a/metagpt/tools/functions/schemas/feature_engineering/GroupStat.yml b/metagpt/tools/schemas/feature_engineering/GroupStat.yml
similarity index 100%
rename from metagpt/tools/functions/schemas/feature_engineering/GroupStat.yml
rename to metagpt/tools/schemas/feature_engineering/GroupStat.yml
diff --git a/metagpt/tools/functions/schemas/feature_engineering/KFoldTargetMeanEncoder.yml b/metagpt/tools/schemas/feature_engineering/KFoldTargetMeanEncoder.yml
similarity index 100%
rename from metagpt/tools/functions/schemas/feature_engineering/KFoldTargetMeanEncoder.yml
rename to metagpt/tools/schemas/feature_engineering/KFoldTargetMeanEncoder.yml
diff --git a/metagpt/tools/functions/schemas/feature_engineering/PolynomialExpansion.yml b/metagpt/tools/schemas/feature_engineering/PolynomialExpansion.yml
similarity index 100%
rename from metagpt/tools/functions/schemas/feature_engineering/PolynomialExpansion.yml
rename to metagpt/tools/schemas/feature_engineering/PolynomialExpansion.yml
diff --git a/metagpt/tools/functions/schemas/feature_engineering/SplitBins.yml b/metagpt/tools/schemas/feature_engineering/SplitBins.yml
similarity index 100%
rename from metagpt/tools/functions/schemas/feature_engineering/SplitBins.yml
rename to metagpt/tools/schemas/feature_engineering/SplitBins.yml
diff --git a/metagpt/tools/functions/schemas/feature_engineering/TargetMeanEncoder.yml b/metagpt/tools/schemas/feature_engineering/TargetMeanEncoder.yml
similarity index 100%
rename from metagpt/tools/functions/schemas/feature_engineering/TargetMeanEncoder.yml
rename to metagpt/tools/schemas/feature_engineering/TargetMeanEncoder.yml
diff --git a/metagpt/tools/functions/schemas/feature_engineering/TreeBasedSelection.yml b/metagpt/tools/schemas/feature_engineering/TreeBasedSelection.yml
similarity index 100%
rename from metagpt/tools/functions/schemas/feature_engineering/TreeBasedSelection.yml
rename to metagpt/tools/schemas/feature_engineering/TreeBasedSelection.yml
diff --git a/metagpt/tools/functions/schemas/feature_engineering/VarianceBasedSelection.yml b/metagpt/tools/schemas/feature_engineering/VarianceBasedSelection.yml
similarity index 100%
rename from metagpt/tools/functions/schemas/feature_engineering/VarianceBasedSelection.yml
rename to metagpt/tools/schemas/feature_engineering/VarianceBasedSelection.yml
diff --git a/metagpt/tools/functions/schemas/stable_diffusion.yml b/metagpt/tools/schemas/stable_diffusion/SDEngine.yml
similarity index 100%
rename from metagpt/tools/functions/schemas/stable_diffusion.yml
rename to metagpt/tools/schemas/stable_diffusion/SDEngine.yml
diff --git a/tests/metagpt/tools/functions/__init__.py b/tests/metagpt/tools/functions/__init__.py
deleted file mode 100644
index 7d36f3404..000000000
--- a/tests/metagpt/tools/functions/__init__.py
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-# @Time    : 2023/11/17 10:24
-# @Author  : lidanyang
-# @File    : __init__.py
-# @Desc    :
diff --git a/tests/metagpt/tools/functions/libs/__init__.py b/tests/metagpt/tools/libs/__init__.py
similarity index 100%
rename from tests/metagpt/tools/functions/libs/__init__.py
rename to tests/metagpt/tools/libs/__init__.py
diff --git a/tests/metagpt/tools/functions/libs/test_data_preprocess.py b/tests/metagpt/tools/libs/test_data_preprocess.py
similarity index 97%
rename from tests/metagpt/tools/functions/libs/test_data_preprocess.py
rename to tests/metagpt/tools/libs/test_data_preprocess.py
index 3c2d661ab..418f8adee 100644
--- a/tests/metagpt/tools/functions/libs/test_data_preprocess.py
+++ b/tests/metagpt/tools/libs/test_data_preprocess.py
@@ -5,7 +5,7 @@ import numpy.testing as npt
 import pandas as pd
 import pytest
 
-from metagpt.tools.functions.libs.data_preprocess import (
+from metagpt.tools.libs.data_preprocess import (
     FillMissingValue,
     LabelEncode,
     MaxAbsScale,
diff --git a/tests/metagpt/tools/functions/libs/test_feature_engineering.py b/tests/metagpt/tools/libs/test_feature_engineering.py
similarity index 97%
rename from tests/metagpt/tools/functions/libs/test_feature_engineering.py
rename to tests/metagpt/tools/libs/test_feature_engineering.py
index 5b45aeb0c..3cfd5dacd 100644
--- a/tests/metagpt/tools/functions/libs/test_feature_engineering.py
+++ b/tests/metagpt/tools/libs/test_feature_engineering.py
@@ -3,7 +3,7 @@ import pandas as pd
 import pytest
 from sklearn.datasets import fetch_california_housing, load_breast_cancer, load_iris
 
-from metagpt.tools.functions.libs.feature_engineering import (
+from metagpt.tools.libs.feature_engineering import (
     CatCount,
     CatCross,
     ExtractTimeComps,
@@ -147,6 +147,7 @@ def test_general_selection(mock_dataset):
     assert "cat2" not in transformed.columns
 
 
+@pytest.mark.skip  # skip because TreeBasedSelection needs lgb as dependency
 def test_tree_based_selection(mock_dataset):
     # regression
     data = load_sklearn_data("housing")
diff --git a/tests/metagpt/tools/functions/test_sd.py b/tests/metagpt/tools/libs/test_sd.py
similarity index 93%
rename from tests/metagpt/tools/functions/test_sd.py
rename to tests/metagpt/tools/libs/test_sd.py
index 142101cad..363cf96b9 100644
--- a/tests/metagpt/tools/functions/test_sd.py
+++ b/tests/metagpt/tools/libs/test_sd.py
@@ -4,7 +4,7 @@
 # @Desc    :
 import pytest
 
-from metagpt.tools.sd_engine import SDEngine
+from metagpt.tools.libs.sd_engine import SDEngine
 
 
 def test_sd_tools():
diff --git a/tests/metagpt/tools/functions/test_udf.py b/tests/metagpt/tools/libs/test_udf.py
similarity index 95%
rename from tests/metagpt/tools/functions/test_udf.py
rename to tests/metagpt/tools/libs/test_udf.py
index 741bd9a9f..19e523448 100644
--- a/tests/metagpt/tools/functions/test_udf.py
+++ b/tests/metagpt/tools/libs/test_udf.py
@@ -3,7 +3,7 @@ import json
 import yaml
 
 from metagpt.logs import logger
-from metagpt.tools.functions.libs.udf import UDFS, UDFS_YAML, docstring_to_yaml
+from metagpt.tools.libs.udf import UDFS, UDFS_YAML, docstring_to_yaml
 
 
 def test_udfs():

From 638dda31cf0c3d1b2fc3834174cd80b3c086abab Mon Sep 17 00:00:00 2001
From: yzlin <yzlin@fuzhi.ai>
Date: Mon, 15 Jan 2024 11:58:07 +0800
Subject: [PATCH 04/12] add unit tests for tool registry

---
 metagpt/tools/tool_registry.py            |   3 +-
 tests/metagpt/tools/test_tool_registry.py | 101 ++++++++++++++++++++++
 2 files changed, 103 insertions(+), 1 deletion(-)
 create mode 100644 tests/metagpt/tools/test_tool_registry.py

diff --git a/metagpt/tools/tool_registry.py b/metagpt/tools/tool_registry.py
index e6519bba9..2c59cd198 100644
--- a/metagpt/tools/tool_registry.py
+++ b/metagpt/tools/tool_registry.py
@@ -50,7 +50,8 @@ class ToolRegistry:
                 return
 
         with open(schema_path, "r", encoding="utf-8") as f:
-            schema = yaml.safe_load(f)[tool_name]
+            schema_dict = yaml.safe_load(f)
+            schema = schema_dict.get(tool_name) or dict(schema_dict.values())
         schema["tool_path"] = tool_path  # corresponding code file path of the tool
         try:
             ToolSchema(**schema)  # validation
diff --git a/tests/metagpt/tools/test_tool_registry.py b/tests/metagpt/tools/test_tool_registry.py
new file mode 100644
index 000000000..fd758b141
--- /dev/null
+++ b/tests/metagpt/tools/test_tool_registry.py
@@ -0,0 +1,101 @@
+import pytest
+
+from metagpt.tools.tool_registry import ToolRegistry
+from metagpt.tools.tool_types import ToolType
+
+
+@pytest.fixture
+def tool_registry():
+    return ToolRegistry()
+
+
+@pytest.fixture
+def schema_yaml(mocker):
+    mock_yaml_content = """
+    tool_name:
+        key1: value1
+        key2: value2
+    """
+    mocker.patch("os.path.exists", return_value=True)
+    mocker.patch("builtins.open", mocker.mock_open(read_data=mock_yaml_content))
+    return mocker
+
+
+# Test Initialization
+def test_initialization(tool_registry):
+    assert isinstance(tool_registry, ToolRegistry)
+    assert tool_registry.tools == {}
+    assert tool_registry.tool_types == {}
+    assert tool_registry.tools_by_types == {}
+
+
+# Test Tool Type Registration
+def test_register_tool_type(tool_registry):
+    tool_type = ToolType(name="TestType", desc="test")
+    tool_registry.register_tool_type(tool_type)
+    assert "TestType" in tool_registry.tool_types
+
+
+# Test Tool Registration
+def test_register_tool(tool_registry, schema_yaml):
+    tool_registry.register_tool("TestTool", "/path/to/tool")
+    assert "TestTool" in tool_registry.tools
+
+
+# Test Tool Registration with Non-existing Schema
+def test_register_tool_no_schema(tool_registry, mocker):
+    mocker.patch("os.path.exists", return_value=False)
+    tool_registry.register_tool("TestTool", "/path/to/tool")
+    assert "TestTool" not in tool_registry.tools
+
+
+# Test Tool Existence Checks
+def test_has_tool(tool_registry, schema_yaml):
+    tool_registry.register_tool("TestTool", "/path/to/tool")
+    assert tool_registry.has_tool("TestTool")
+    assert not tool_registry.has_tool("NonexistentTool")
+
+
+# Test Tool Retrieval
+def test_get_tool(tool_registry, schema_yaml):
+    tool_registry.register_tool("TestTool", "/path/to/tool")
+    tool = tool_registry.get_tool("TestTool")
+    assert tool is not None
+    assert tool.name == "TestTool"
+    assert tool.path == "/path/to/tool"
+
+
+# Similar tests for has_tool_type, get_tool_type, get_tools_by_type
+def test_has_tool_type(tool_registry):
+    tool_type = ToolType(name="TestType", desc="test")
+    tool_registry.register_tool_type(tool_type)
+    assert tool_registry.has_tool_type("TestType")
+    assert not tool_registry.has_tool_type("NonexistentType")
+
+
+def test_get_tool_type(tool_registry):
+    tool_type = ToolType(name="TestType", desc="test")
+    tool_registry.register_tool_type(tool_type)
+    retrieved_type = tool_registry.get_tool_type("TestType")
+    assert retrieved_type is not None
+    assert retrieved_type.name == "TestType"
+
+
+def test_get_tools_by_type(tool_registry, schema_yaml):
+    tool_type_name = "TestType"
+    tool_name = "TestTool"
+    tool_path = "/path/to/tool"
+    tool_type = ToolType(name=tool_type_name, desc="test")
+    tool_registry.register_tool_type(tool_type)
+
+    tool_registry.register_tool(tool_name, tool_path, tool_type_name=tool_type_name)
+
+    tools_by_type = tool_registry.get_tools_by_type(tool_type_name)
+    assert tools_by_type is not None
+    assert tool_name in tools_by_type
+
+
+# Test case for when the tool type does not exist
+def test_get_tools_by_nonexistent_type(tool_registry):
+    tools_by_type = tool_registry.get_tools_by_type("NonexistentType")
+    assert tools_by_type is None

From 8a14dde219f8ec03531c21f0f62c75bcc680ae60 Mon Sep 17 00:00:00 2001
From: yzlin <yzlin@fuzhi.ai>
Date: Tue, 16 Jan 2024 15:46:13 +0800
Subject: [PATCH 05/12] tool_type renaming

---
 metagpt/prompts/{tool_type.py => tool_types.py} | 0
 metagpt/roles/code_interpreter.py               | 7 +++++++
 metagpt/tools/tool_types.py                     | 2 +-
 3 files changed, 8 insertions(+), 1 deletion(-)
 rename metagpt/prompts/{tool_type.py => tool_types.py} (100%)

diff --git a/metagpt/prompts/tool_type.py b/metagpt/prompts/tool_types.py
similarity index 100%
rename from metagpt/prompts/tool_type.py
rename to metagpt/prompts/tool_types.py
diff --git a/metagpt/roles/code_interpreter.py b/metagpt/roles/code_interpreter.py
index afd51a575..46cc00d5e 100644
--- a/metagpt/roles/code_interpreter.py
+++ b/metagpt/roles/code_interpreter.py
@@ -5,6 +5,7 @@ from pydantic import Field
 from metagpt.actions.ask_review import ReviewConst
 from metagpt.actions.execute_code import ExecutePyCode
 from metagpt.actions.write_analysis_code import WriteCodeByGenerate, WriteCodeWithTools
+from metagpt.actions.write_code_steps import WriteCodeSteps
 from metagpt.logs import logger
 from metagpt.roles import Role
 from metagpt.roles.tool_maker import ToolMaker
@@ -16,6 +17,7 @@ class CodeInterpreter(Role):
     auto_run: bool = True
     use_tools: bool = False
     make_udfs: bool = False  # whether to save user-defined functions
+    use_code_steps: bool = False
     execute_code: ExecutePyCode = Field(default_factory=ExecutePyCode, exclude=True)
 
     def __init__(
@@ -56,6 +58,10 @@ class CodeInterpreter(Role):
         return task_result
 
     async def _write_and_exec_code(self, max_retry: int = 3):
+        self.planner.current_task.code_steps = (
+            await WriteCodeSteps().run(self.planner.plan) if self.use_code_steps else ""
+        )
+
         counter = 0
         success = False
 
@@ -90,6 +96,7 @@ class CodeInterpreter(Role):
         logger.info(f"ready to {todo.name}")
 
         context = self.planner.get_useful_memories()
+        # print(*context, sep="\n***\n")
         code = await todo.run(context=context, plan=self.planner.plan, temperature=0.0)
         # 暂时在这里转换 WriteCodeWithTools 的输出
         if isinstance(code, str):
diff --git a/metagpt/tools/tool_types.py b/metagpt/tools/tool_types.py
index 97eb574da..289271985 100644
--- a/metagpt/tools/tool_types.py
+++ b/metagpt/tools/tool_types.py
@@ -1,4 +1,4 @@
-from metagpt.prompts.tool_type import (
+from metagpt.prompts.tool_types import (
     DATA_PREPROCESS_PROMPT,
     FEATURE_ENGINEERING_PROMPT,
     MODEL_EVALUATE_PROMPT,

From c8858cd8d464ef2c477770f927310e1a84cc7b3c Mon Sep 17 00:00:00 2001
From: yzlin <yzlin@fuzhi.ai>
Date: Tue, 16 Jan 2024 17:54:38 +0800
Subject: [PATCH 06/12] minimize ml_engineer

---
 metagpt/actions/ml_da_action.py        |   2 +-
 metagpt/actions/write_analysis_code.py |  23 +++--
 metagpt/prompts/ml_engineer.py         |   8 +-
 metagpt/roles/ml_engineer.py           | 111 +++++++------------------
 metagpt/tools/tool_data_type.py        |   1 +
 metagpt/tools/tool_types.py            |   6 ++
 6 files changed, 51 insertions(+), 100 deletions(-)

diff --git a/metagpt/actions/ml_da_action.py b/metagpt/actions/ml_da_action.py
index d4e77773f..584c4db7a 100644
--- a/metagpt/actions/ml_da_action.py
+++ b/metagpt/actions/ml_da_action.py
@@ -63,4 +63,4 @@ class UpdateDataColumns(Action):
         prompt = UPDATE_DATA_COLUMNS.format(history_code=code_context)
         tool_config = create_func_config(PRINT_DATA_COLUMNS)
         rsp = await self.llm.aask_code(prompt, **tool_config)
-        return rsp
+        return rsp["code"]
diff --git a/metagpt/actions/write_analysis_code.py b/metagpt/actions/write_analysis_code.py
index f4ae1e572..efd1ea163 100644
--- a/metagpt/actions/write_analysis_code.py
+++ b/metagpt/actions/write_analysis_code.py
@@ -155,10 +155,6 @@ class WriteCodeWithTools(BaseWriteAnalysisCode):
         )
         code_steps = plan.current_task.code_steps
 
-        finished_tasks = plan.get_finished_tasks()
-        code_context = [remove_comments(task.code) for task in finished_tasks]
-        code_context = "\n\n".join(code_context)
-
         tool_catalog = {}
 
         if available_tools:
@@ -189,26 +185,28 @@ class WriteCodeWithToolsML(WriteCodeWithTools):
         column_info: str = "",
         **kwargs,
     ) -> Tuple[List[Message], str]:
-        tool_type = plan.current_task.task_type
-        available_tools = self.available_tools.get(tool_type, {})
-        special_prompt = TOOL_TYPE_USAGE_PROMPT.get(tool_type, "")
+        tool_type = (
+            plan.current_task.task_type
+        )  # find tool type from task type through exact match, can extend to retrieval in the future
+        available_tools = TOOL_REGISTRY.get_tools_by_type(tool_type)
+        special_prompt = (
+            TOOL_REGISTRY.get_tool_type(tool_type).usage_prompt if TOOL_REGISTRY.has_tool_type(tool_type) else ""
+        )
         code_steps = plan.current_task.code_steps
 
         finished_tasks = plan.get_finished_tasks()
         code_context = [remove_comments(task.code) for task in finished_tasks]
         code_context = "\n\n".join(code_context)
 
-        if len(available_tools) > 0:
-            available_tools = {k: v["description"] for k, v in available_tools.items()}
+        if available_tools:
+            available_tools = {tool_name: tool.schema["description"] for tool_name, tool in available_tools.items()}
 
             recommend_tools = await self._tool_recommendation(
                 plan.current_task.instruction, code_steps, available_tools
             )
-            tool_catalog = self._parse_recommend_tools(tool_type, recommend_tools)
+            tool_catalog = self._parse_recommend_tools(recommend_tools)
             logger.info(f"Recommended tools: \n{recommend_tools}")
 
-            module_name = TOOL_TYPE_MODULE[tool_type]
-
             prompt = ML_TOOL_USAGE_PROMPT.format(
                 user_requirement=plan.goal,
                 history_code=code_context,
@@ -216,7 +214,6 @@ class WriteCodeWithToolsML(WriteCodeWithTools):
                 column_info=column_info,
                 special_prompt=special_prompt,
                 code_steps=code_steps,
-                module_name=module_name,
                 tool_catalog=tool_catalog,
             )
 
diff --git a/metagpt/prompts/ml_engineer.py b/metagpt/prompts/ml_engineer.py
index ff29d5ed4..3fd895e6e 100644
--- a/metagpt/prompts/ml_engineer.py
+++ b/metagpt/prompts/ml_engineer.py
@@ -134,16 +134,12 @@ PRINT_DATA_COLUMNS = {
     "parameters": {
         "type": "object",
         "properties": {
-            "is_update": {
-                "type": "boolean",
-                "description": "Whether need to update the column info.",
-            },
             "code": {
                 "type": "string",
                 "description": "The code to be added to a new cell in jupyter.",
             },
         },
-        "required": ["is_update", "code"],
+        "required": ["code"],
     },
 }
 
@@ -240,7 +236,7 @@ Strictly follow steps below when you writing code if it's convenient.
 - You can freely combine the use of any other public packages, like sklearn, numpy, pandas, etc..
 
 # Available Tools:
-Each Class tool is described in JSON format. When you call a tool, import the tool from `{module_name}` first.
+Each Class tool is described in JSON format. When you call a tool, import the tool from its path first.
 {tool_catalog}
 
 # Output Example:
diff --git a/metagpt/roles/ml_engineer.py b/metagpt/roles/ml_engineer.py
index a60642bff..aeea39c0c 100644
--- a/metagpt/roles/ml_engineer.py
+++ b/metagpt/roles/ml_engineer.py
@@ -1,64 +1,43 @@
-from metagpt.actions.ask_review import ReviewConst
 from metagpt.actions.debug_code import DebugCode
 from metagpt.actions.execute_code import ExecutePyCode
-from metagpt.actions.ml_da_action import Reflect, SummarizeAnalysis, UpdateDataColumns
+from metagpt.actions.ml_da_action import UpdateDataColumns
 from metagpt.actions.write_analysis_code import WriteCodeWithToolsML
-from metagpt.actions.write_code_steps import WriteCodeSteps
 from metagpt.logs import logger
 from metagpt.roles.code_interpreter import CodeInterpreter
-from metagpt.roles.kaggle_manager import DownloadData, SubmitResult
-from metagpt.schema import Message
+from metagpt.tools.tool_data_type import ToolTypeEnum
 from metagpt.utils.common import any_to_str
 
 
 class MLEngineer(CodeInterpreter):
-    use_code_steps: bool = False
-    use_udfs: bool = False
-    data_desc: dict = {}
     debug_context: list = []
     latest_code: str = ""
 
     def __init__(self, name="Mark", profile="MLEngineer", **kwargs):
         super().__init__(name=name, profile=profile, **kwargs)
-        # self._watch([DownloadData, SubmitResult])  # in multi-agent settings
-
-    async def _plan_and_act(self):
-        ### a new attempt on the data, relevant in a multi-agent multi-turn setting ###
-        await self._prepare_data_context()
-
-        ### general plan process ###
-        await super()._plan_and_act()
-
-        ### summarize analysis ###
-        summary = await SummarizeAnalysis().run(self.planner.plan)
-        rsp = Message(content=summary, cause_by=SummarizeAnalysis)
-        self.rc.memory.add(rsp)
-
-        return rsp
-
-    async def _write_and_exec_code(self, max_retry: int = 3):
-        self.planner.current_task.code_steps = (
-            await WriteCodeSteps().run(self.planner.plan) if self.use_code_steps else ""
-        )
-
-        code, result, success = await super()._write_and_exec_code(max_retry=max_retry)
-
-        if success:
-            if self.use_tools and self.planner.current_task.task_type in ["data_preprocess", "feature_engineering"]:
-                update_success, new_code = await self._update_data_columns()
-                if update_success:
-                    code = code + "\n\n" + new_code
-
-        return code, result, success
 
     async def _write_code(self):
         if not self.use_tools:
             return await super()._write_code()
 
-        code_execution_count = sum([msg.cause_by == any_to_str(ExecutePyCode) for msg in self.working_memory.get()])
+        # In a trial-and-error setting, check whether this is the first attempt at the task: it is if no code has been executed yet.
+        is_first_trial = any_to_str(ExecutePyCode) not in [msg.cause_by for msg in self.working_memory.get()]
 
-        if code_execution_count > 0:
-            logger.warning("We got a bug code, now start to debug...")
+        if is_first_trial:
+            # For the first trial, write task code from scratch
+            column_info = await self._update_data_columns()
+
+            logger.info("Write code with tools")
+            tool_context, code = await WriteCodeWithToolsML().run(
+                context=[],  # context assembled inside the Action
+                plan=self.planner.plan,
+                column_info=column_info,
+            )
+            self.debug_context = tool_context
+            cause_by = WriteCodeWithToolsML
+
+        else:
+            # Previous trials resulted in error, debug and rewrite the code
+            logger.warning("We got a bug, now start to debug...")
             code = await DebugCode().run(
                 code=self.latest_code,
                 runtime_result=self.working_memory.get(),
@@ -67,49 +46,21 @@ class MLEngineer(CodeInterpreter):
             logger.info(f"new code \n{code}")
             cause_by = DebugCode
 
-        else:
-            logger.info("Write code with tools")
-            tool_context, code = await WriteCodeWithToolsML().run(
-                context=[],  # context assembled inside the Action
-                plan=self.planner.plan,
-                column_info=self.data_desc.get("column_info", ""),
-            )
-            self.debug_context = tool_context
-            cause_by = WriteCodeWithToolsML
-
         self.latest_code = code
 
         return code, cause_by
 
     async def _update_data_columns(self):
+        current_task = self.planner.plan.current_task
+        if current_task.task_type not in [
+            ToolTypeEnum.DATA_PREPROCESS.value,
+            ToolTypeEnum.FEATURE_ENGINEERING.value,
+            ToolTypeEnum.MODEL_TRAIN.value,
+        ]:
+            return ""
         logger.info("Check columns in updated data")
-        rsp = await UpdateDataColumns().run(self.planner.plan)
-        is_update, code = rsp["is_update"], rsp["code"]
+        code = await UpdateDataColumns().run(self.planner.plan)
         success = False
-        if is_update:
-            result, success = await self.execute_code.run(code)
-            if success:
-                print(result)
-                self.data_desc["column_info"] = result
-        return success, code
-
-    async def _prepare_data_context(self):
-        memories = self.get_memories()
-        if memories:
-            latest_event = memories[-1].cause_by
-            if latest_event == DownloadData:
-                self.planner.plan.context = memories[-1].content
-            elif latest_event == SubmitResult:
-                # self reflect on previous plan outcomes and think about how to improve the plan, add to working  memory
-                await self._reflect()
-
-                # get feedback for improvement from human, add to working memory
-                await self.planner.ask_review(trigger=ReviewConst.TASK_REVIEW_TRIGGER)
-
-    async def _reflect(self):
-        context = self.get_memories()
-        context = "\n".join([str(msg) for msg in context])
-
-        reflection = await Reflect().run(context=context)
-        self.working_memory.add(Message(content=reflection, role="assistant"))
-        self.working_memory.add(Message(content=Reflect.REWRITE_PLAN_INSTRUCTION, role="user"))
+        result, success = await self.execute_code.run(code)
+        print(result)
+        return result if success else ""
diff --git a/metagpt/tools/tool_data_type.py b/metagpt/tools/tool_data_type.py
index c767fef9b..a3ab20a4e 100644
--- a/metagpt/tools/tool_data_type.py
+++ b/metagpt/tools/tool_data_type.py
@@ -4,6 +4,7 @@ from pydantic import BaseModel
 
 
 class ToolTypeEnum(Enum):
+    EDA = "eda"
     DATA_PREPROCESS = "data_preprocess"
     FEATURE_ENGINEERING = "feature_engineering"
     MODEL_TRAIN = "model_train"
diff --git a/metagpt/tools/tool_types.py b/metagpt/tools/tool_types.py
index 289271985..2e22adc40 100644
--- a/metagpt/tools/tool_types.py
+++ b/metagpt/tools/tool_types.py
@@ -8,6 +8,12 @@ from metagpt.tools.tool_data_type import ToolType, ToolTypeEnum
 from metagpt.tools.tool_registry import register_tool_type
 
 
+@register_tool_type
+class EDA(ToolType):
+    name: str = ToolTypeEnum.EDA.value
+    desc: str = "Useful for performing exploratory data analysis"
+
+
 @register_tool_type
 class DataPreprocess(ToolType):
     name: str = ToolTypeEnum.DATA_PREPROCESS.value

From 9dc421b1229bc88fb9b5f2c8307fd98b16874ab5 Mon Sep 17 00:00:00 2001
From: yzlin <yzlin@fuzhi.ai>
Date: Tue, 16 Jan 2024 19:18:03 +0800
Subject: [PATCH 07/12] rename schema to schemas to avoid pydantic warning

---
 metagpt/actions/write_analysis_code.py |  6 +++---
 metagpt/tools/tool_data_type.py        |  2 +-
 metagpt/tools/tool_registry.py         | 10 +++++-----
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/metagpt/actions/write_analysis_code.py b/metagpt/actions/write_analysis_code.py
index efd1ea163..65be198ef 100644
--- a/metagpt/actions/write_analysis_code.py
+++ b/metagpt/actions/write_analysis_code.py
@@ -110,7 +110,7 @@ class WriteCodeWithTools(BaseWriteAnalysisCode):
             if TOOL_REGISTRY.has_tool(tool_name):
                 valid_tools.append(TOOL_REGISTRY.get_tool(tool_name))
 
-        tool_catalog = {tool.name: tool.schema for tool in valid_tools}
+        tool_catalog = {tool.name: tool.schemas for tool in valid_tools}
         return tool_catalog
 
     async def _tool_recommendation(
@@ -158,7 +158,7 @@ class WriteCodeWithTools(BaseWriteAnalysisCode):
         tool_catalog = {}
 
         if available_tools:
-            available_tools = {tool_name: tool.schema["description"] for tool_name, tool in available_tools.items()}
+            available_tools = {tool_name: tool.schemas["description"] for tool_name, tool in available_tools.items()}
 
             recommend_tools = await self._tool_recommendation(
                 plan.current_task.instruction, code_steps, available_tools
@@ -199,7 +199,7 @@ class WriteCodeWithToolsML(WriteCodeWithTools):
         code_context = "\n\n".join(code_context)
 
         if available_tools:
-            available_tools = {tool_name: tool.schema["description"] for tool_name, tool in available_tools.items()}
+            available_tools = {tool_name: tool.schemas["description"] for tool_name, tool in available_tools.items()}
 
             recommend_tools = await self._tool_recommendation(
                 plan.current_task.instruction, code_steps, available_tools
diff --git a/metagpt/tools/tool_data_type.py b/metagpt/tools/tool_data_type.py
index a3ab20a4e..8206afa59 100644
--- a/metagpt/tools/tool_data_type.py
+++ b/metagpt/tools/tool_data_type.py
@@ -29,5 +29,5 @@ class ToolSchema(BaseModel):
 class Tool(BaseModel):
     name: str
     path: str
-    schema: dict = {}
+    schemas: dict = {}
     code: str = ""
diff --git a/metagpt/tools/tool_registry.py b/metagpt/tools/tool_registry.py
index 2c59cd198..5d743358c 100644
--- a/metagpt/tools/tool_registry.py
+++ b/metagpt/tools/tool_registry.py
@@ -25,7 +25,7 @@ class ToolRegistry:
 
     def register_tool_type(self, tool_type: ToolType):
         self.tool_types[tool_type.name] = tool_type
-        logger.info(f"{tool_type.name} registered")
+        logger.info(f"tool type {tool_type.name} registered")
 
     def register_tool(
         self,
@@ -51,16 +51,16 @@ class ToolRegistry:
 
         with open(schema_path, "r", encoding="utf-8") as f:
             schema_dict = yaml.safe_load(f)
-            schema = schema_dict.get(tool_name) or dict(schema_dict.values())
-        schema["tool_path"] = tool_path  # corresponding code file path of the tool
+            schemas = schema_dict.get(tool_name) or next(iter(schema_dict.values()))  # else first schema in file
+        schemas["tool_path"] = tool_path  # corresponding code file path of the tool
         try:
-            ToolSchema(**schema)  # validation
+            ToolSchema(**schemas)  # validation
         except Exception:
             pass
             # logger.warning(
             #     f"{tool_name} schema not conforms to required format, but will be used anyway. Mismatch: {e}"
             # )
-        tool = Tool(name=tool_name, path=tool_path, schema=schema, code=tool_code)
+        tool = Tool(name=tool_name, path=tool_path, schemas=schemas, code=tool_code)
         self.tools[tool_name] = tool
         self.tools_by_types[tool_type_name][tool_name] = tool
         logger.info(f"{tool_name} registered")

From 1cabf2c503f2de5c037049af78923ad2faa2be4a Mon Sep 17 00:00:00 2001
From: yzlin <yzlin@fuzhi.ai>
Date: Thu, 18 Jan 2024 20:34:32 +0800
Subject: [PATCH 08/12] change register arg name, integrate image2web tool

---
 metagpt/prompts/tool_types.py                 |  4 +-
 metagpt/tools/__init__.py                     |  4 +-
 metagpt/tools/libs/__init__.py                |  5 +-
 metagpt/tools/libs/data_preprocess.py         | 18 +++----
 metagpt/tools/libs/feature_engineering.py     | 20 ++++----
 .../vision.py => libs/gpt_v_generator.py}     | 34 ++++++-------
 metagpt/tools/libs/sd_engine.py               |  5 +-
 .../image2webpage/GPTvGenerator.yml}          |  2 +-
 metagpt/tools/tool_data_type.py               |  1 +
 metagpt/tools/tool_registry.py                | 12 ++---
 metagpt/tools/tool_types.py                   |  8 ++++
 .../tools/functions/libs/test_vision.py       | 48 -------------------
 .../tools/libs/test_gpt_v_generator.py        | 40 ++++++++++++++++
 .../libs/{test_sd.py => test_sd_engine.py}    |  0
 tests/metagpt/tools/test_tool_registry.py     |  2 +-
 15 files changed, 100 insertions(+), 103 deletions(-)
 rename metagpt/tools/{functions/libs/vision.py => libs/gpt_v_generator.py} (85%)
 rename metagpt/tools/{functions/schemas/vision.yml => schemas/image2webpage/GPTvGenerator.yml} (93%)
 delete mode 100644 tests/metagpt/tools/functions/libs/test_vision.py
 create mode 100644 tests/metagpt/tools/libs/test_gpt_v_generator.py
 rename tests/metagpt/tools/libs/{test_sd.py => test_sd_engine.py} (100%)

diff --git a/metagpt/prompts/tool_types.py b/metagpt/prompts/tool_types.py
index 43ead78a6..c01a80310 100644
--- a/metagpt/prompts/tool_types.py
+++ b/metagpt/prompts/tool_types.py
@@ -39,7 +39,7 @@ The current task is about evaluating a model, please note the following:
 """
 
 # Prompt for using tools of "vision" type
-VISION_PROMPT = """
+IMAGE2WEBPAGE_PROMPT = """
 The current task is about converting image into webpage code. please note the following:
 - Single-Step Code Generation: Execute the entire code generation process in a single step, encompassing HTML, CSS, and JavaScript. Avoid fragmenting the code generation into multiple separate steps to maintain consistency and simplify the development workflow.
-"""
\ No newline at end of file
+"""
diff --git a/metagpt/tools/__init__.py b/metagpt/tools/__init__.py
index 23b51533d..f18d1d276 100644
--- a/metagpt/tools/__init__.py
+++ b/metagpt/tools/__init__.py
@@ -11,9 +11,7 @@ from metagpt.tools import tool_types  # this registers all tool types
 from metagpt.tools import libs  # this registers all tools
 from metagpt.tools.tool_registry import TOOL_REGISTRY
 
-_ = tool_types  # Avoid pre-commit error
-_ = libs  # Avoid pre-commit error
-_ = TOOL_REGISTRY  # Avoid pre-commit error
+_, _, _ = tool_types, libs, TOOL_REGISTRY  # Avoid pre-commit error
 
 
 class SearchEngineType(Enum):
diff --git a/metagpt/tools/libs/__init__.py b/metagpt/tools/libs/__init__.py
index 3d74674aa..b576997c9 100644
--- a/metagpt/tools/libs/__init__.py
+++ b/metagpt/tools/libs/__init__.py
@@ -7,7 +7,8 @@
 from metagpt.tools.libs import (
     data_preprocess,
     feature_engineering,
+    sd_engine,
+    gpt_v_generator,
 )
 
-_ = data_preprocess  # Avoid pre-commit error
-_ = feature_engineering  # Avoid pre-commit error
+_, _, _, _ = data_preprocess, feature_engineering, sd_engine, gpt_v_generator  # Avoid pre-commit error
diff --git a/metagpt/tools/libs/data_preprocess.py b/metagpt/tools/libs/data_preprocess.py
index 7cc44263d..3891f9df0 100644
--- a/metagpt/tools/libs/data_preprocess.py
+++ b/metagpt/tools/libs/data_preprocess.py
@@ -31,7 +31,7 @@ class MLProcess(object):
         return self.transform(df)
 
 
-@register_tool(tool_type_name=TOOL_TYPE)
+@register_tool(tool_type=TOOL_TYPE)
 class FillMissingValue(MLProcess):
     def __init__(
         self,
@@ -58,7 +58,7 @@ class FillMissingValue(MLProcess):
         return new_df
 
 
-@register_tool(tool_type_name=TOOL_TYPE)
+@register_tool(tool_type=TOOL_TYPE)
 class MinMaxScale(MLProcess):
     def __init__(
         self,
@@ -77,7 +77,7 @@ class MinMaxScale(MLProcess):
         return new_df
 
 
-@register_tool(tool_type_name=TOOL_TYPE)
+@register_tool(tool_type=TOOL_TYPE)
 class StandardScale(MLProcess):
     def __init__(
         self,
@@ -96,7 +96,7 @@ class StandardScale(MLProcess):
         return new_df
 
 
-@register_tool(tool_type_name=TOOL_TYPE)
+@register_tool(tool_type=TOOL_TYPE)
 class MaxAbsScale(MLProcess):
     def __init__(
         self,
@@ -115,7 +115,7 @@ class MaxAbsScale(MLProcess):
         return new_df
 
 
-@register_tool(tool_type_name=TOOL_TYPE)
+@register_tool(tool_type=TOOL_TYPE)
 class RobustScale(MLProcess):
     def __init__(
         self,
@@ -134,7 +134,7 @@ class RobustScale(MLProcess):
         return new_df
 
 
-@register_tool(tool_type_name=TOOL_TYPE)
+@register_tool(tool_type=TOOL_TYPE)
 class OrdinalEncode(MLProcess):
     def __init__(
         self,
@@ -153,7 +153,7 @@ class OrdinalEncode(MLProcess):
         return new_df
 
 
-@register_tool(tool_type_name=TOOL_TYPE)
+@register_tool(tool_type=TOOL_TYPE)
 class OneHotEncode(MLProcess):
     def __init__(
         self,
@@ -175,7 +175,7 @@ class OneHotEncode(MLProcess):
         return new_df
 
 
-@register_tool(tool_type_name=TOOL_TYPE)
+@register_tool(tool_type=TOOL_TYPE)
 class LabelEncode(MLProcess):
     def __init__(
         self,
@@ -204,7 +204,7 @@ class LabelEncode(MLProcess):
         return new_df
 
 
-@register_tool(tool_type_name=TOOL_TYPE)
+@register_tool(tool_type=TOOL_TYPE)
 def get_column_info(df: pd.DataFrame) -> dict:
     column_info = {
         "Category": [],
diff --git a/metagpt/tools/libs/feature_engineering.py b/metagpt/tools/libs/feature_engineering.py
index ed5c1be72..308150f9b 100644
--- a/metagpt/tools/libs/feature_engineering.py
+++ b/metagpt/tools/libs/feature_engineering.py
@@ -22,7 +22,7 @@ from metagpt.tools.tool_registry import register_tool
 TOOL_TYPE = ToolTypeEnum.FEATURE_ENGINEERING.value
 
 
-@register_tool(tool_type_name=TOOL_TYPE)
+@register_tool(tool_type=TOOL_TYPE)
 class PolynomialExpansion(MLProcess):
     def __init__(self, cols: list, degree: int = 2, label_col: str = None):
         self.cols = cols
@@ -53,7 +53,7 @@ class PolynomialExpansion(MLProcess):
         return new_df
 
 
-@register_tool(tool_type_name=TOOL_TYPE)
+@register_tool(tool_type=TOOL_TYPE)
 class CatCount(MLProcess):
     def __init__(self, col: str):
         self.col = col
@@ -68,7 +68,7 @@ class CatCount(MLProcess):
         return new_df
 
 
-@register_tool(tool_type_name=TOOL_TYPE)
+@register_tool(tool_type=TOOL_TYPE)
 class TargetMeanEncoder(MLProcess):
     def __init__(self, col: str, label: str):
         self.col = col
@@ -84,7 +84,7 @@ class TargetMeanEncoder(MLProcess):
         return new_df
 
 
-@register_tool(tool_type_name=TOOL_TYPE)
+@register_tool(tool_type=TOOL_TYPE)
 class KFoldTargetMeanEncoder(MLProcess):
     def __init__(self, col: str, label: str, n_splits: int = 5, random_state: int = 2021):
         self.col = col
@@ -111,7 +111,7 @@ class KFoldTargetMeanEncoder(MLProcess):
         return new_df
 
 
-@register_tool(tool_type_name=TOOL_TYPE)
+@register_tool(tool_type=TOOL_TYPE)
 class CatCross(MLProcess):
     def __init__(self, cols: list, max_cat_num: int = 100):
         self.cols = cols
@@ -147,7 +147,7 @@ class CatCross(MLProcess):
         return new_df
 
 
-@register_tool(tool_type_name=TOOL_TYPE)
+@register_tool(tool_type=TOOL_TYPE)
 class GroupStat(MLProcess):
     def __init__(self, group_col: str, agg_col: str, agg_funcs: list):
         self.group_col = group_col
@@ -167,7 +167,7 @@ class GroupStat(MLProcess):
         return new_df
 
 
-@register_tool(tool_type_name=TOOL_TYPE)
+@register_tool(tool_type=TOOL_TYPE)
 class SplitBins(MLProcess):
     def __init__(self, cols: list, strategy: str = "quantile"):
         self.cols = cols
@@ -184,7 +184,7 @@ class SplitBins(MLProcess):
         return new_df
 
 
-@register_tool(tool_type_name=TOOL_TYPE)
+@register_tool(tool_type=TOOL_TYPE)
 class ExtractTimeComps(MLProcess):
     def __init__(self, time_col: str, time_comps: list):
         self.time_col = time_col
@@ -213,7 +213,7 @@ class ExtractTimeComps(MLProcess):
         return new_df
 
 
-@register_tool(tool_type_name=TOOL_TYPE)
+@register_tool(tool_type=TOOL_TYPE)
 class GeneralSelection(MLProcess):
     def __init__(self, label_col: str):
         self.label_col = label_col
@@ -284,7 +284,7 @@ class TreeBasedSelection(MLProcess):
         return new_df
 
 
-@register_tool(tool_type_name=TOOL_TYPE)
+@register_tool(tool_type=TOOL_TYPE)
 class VarianceBasedSelection(MLProcess):
     def __init__(self, label_col: str, threshold: float = 0):
         self.label_col = label_col
diff --git a/metagpt/tools/functions/libs/vision.py b/metagpt/tools/libs/gpt_v_generator.py
similarity index 85%
rename from metagpt/tools/functions/libs/vision.py
rename to metagpt/tools/libs/gpt_v_generator.py
index b10ad7608..58e547840 100644
--- a/metagpt/tools/functions/libs/vision.py
+++ b/metagpt/tools/libs/gpt_v_generator.py
@@ -5,18 +5,13 @@
 @Author  : mannaandpoem
-@File    : vision.py
+@File    : gpt_v_generator.py
 """
+import base64
 from pathlib import Path
 
 import requests
 
-import base64
-
-from metagpt.config import CONFIG
-
-OPENAI_API_BASE = CONFIG.OPENAI_BASE_URL
-API_KEY = CONFIG.OPENAI_API_KEY
-MODEL = CONFIG.OPENAI_VISION_MODEL
-MAX_TOKENS = CONFIG.VISION_MAX_TOKENS
+from metagpt.tools.tool_data_type import ToolTypeEnum
+from metagpt.tools.tool_registry import register_tool
 
 ANALYZE_LAYOUT_PROMPT = """You are now a UI/UX, please generate layout information for this image:
 
@@ -33,8 +28,15 @@ As the design pays tribute to large companies, sometimes it is normal for some c
 Now, please generate the corresponding webpage code including HTML, CSS and JavaScript:"""
 
 
-class Vision:
+@register_tool(tool_type=ToolTypeEnum.IMAGE2WEBPAGE.value)
+class GPTvGenerator:
     def __init__(self):
+        from metagpt.config import CONFIG
+
+        OPENAI_API_BASE = CONFIG.OPENAI_BASE_URL
+        API_KEY = CONFIG.OPENAI_API_KEY
+        MODEL = CONFIG.OPENAI_VISION_MODEL
+        MAX_TOKENS = CONFIG.VISION_MAX_TOKENS
         self.api_key = API_KEY
         self.api_base = OPENAI_API_BASE
         self.model = MODEL
@@ -51,10 +53,7 @@ class Vision:
 
     def get_result(self, image_path, prompt):
         base64_image = self.encode_image(image_path)
-        headers = {
-            "Content-Type": "application/json",
-            "Authorization": f"Bearer {self.api_key}"
-        }
+        headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}
         payload = {
             "model": self.model,
             "messages": [
@@ -62,11 +61,8 @@ class Vision:
                     "role": "user",
                     "content": [
                         {"type": "text", "text": prompt},
-                        {
-                            "type": "image_url",
-                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}
-                        }
-                    ]
+                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}},
+                    ],
                 }
             ],
             "max_tokens": self.max_tokens,
@@ -81,7 +77,7 @@ class Vision:
     @staticmethod
     def encode_image(image_path):
         with open(image_path, "rb") as image_file:
-            return base64.b64encode(image_file.read()).decode('utf-8')
+            return base64.b64encode(image_file.read()).decode("utf-8")
 
     @staticmethod
     def save_webpages(image_path, webpages) -> Path:
diff --git a/metagpt/tools/libs/sd_engine.py b/metagpt/tools/libs/sd_engine.py
index ad63c2505..794758f77 100644
--- a/metagpt/tools/libs/sd_engine.py
+++ b/metagpt/tools/libs/sd_engine.py
@@ -13,7 +13,6 @@ import requests
 from aiohttp import ClientSession
 from PIL import Image, PngImagePlugin
 
-from metagpt.config import CONFIG
 from metagpt.const import SD_OUTPUT_FILE_REPO
 from metagpt.logs import logger
 from metagpt.tools.tool_data_type import ToolTypeEnum
@@ -53,9 +52,11 @@ payload = {
 default_negative_prompt = "(easynegative:0.8),black, dark,Low resolution"
 
 
-@register_tool(tool_type_name=ToolTypeEnum.STABLE_DIFFUSION.value)
+@register_tool(tool_type=ToolTypeEnum.STABLE_DIFFUSION.value)
 class SDEngine:
     def __init__(self, sd_url=""):
+        from metagpt.config import CONFIG
+
         # Initialize the SDEngine with configuration
         self.sd_url = sd_url if sd_url else CONFIG.get("SD_URL")
         self.sd_t2i_url = f"{self.sd_url}{CONFIG.get('SD_T2I_API')}"
diff --git a/metagpt/tools/functions/schemas/vision.yml b/metagpt/tools/schemas/image2webpage/GPTvGenerator.yml
similarity index 93%
rename from metagpt/tools/functions/schemas/vision.yml
rename to metagpt/tools/schemas/image2webpage/GPTvGenerator.yml
index 4cb247419..4087f7c12 100644
--- a/metagpt/tools/functions/schemas/vision.yml
+++ b/metagpt/tools/schemas/image2webpage/GPTvGenerator.yml
@@ -1,4 +1,4 @@
-Vision:
+GPTvGenerator:
   type: class
   description: "Class for generating web pages at once."
   methods:
diff --git a/metagpt/tools/tool_data_type.py b/metagpt/tools/tool_data_type.py
index 8206afa59..45fb539a6 100644
--- a/metagpt/tools/tool_data_type.py
+++ b/metagpt/tools/tool_data_type.py
@@ -10,6 +10,7 @@ class ToolTypeEnum(Enum):
     MODEL_TRAIN = "model_train"
     MODEL_EVALUATE = "model_evaluate"
     STABLE_DIFFUSION = "stable_diffusion"
+    IMAGE2WEBPAGE = "image2webpage"
     OTHER = "other"
 
     def __missing__(self, key):
diff --git a/metagpt/tools/tool_registry.py b/metagpt/tools/tool_registry.py
index 5d743358c..0544d25ee 100644
--- a/metagpt/tools/tool_registry.py
+++ b/metagpt/tools/tool_registry.py
@@ -21,7 +21,7 @@ class ToolRegistry:
     def __init__(self):
         self.tools = {}
         self.tool_types = {}
-        self.tools_by_types = defaultdict(dict)  # two-layer k-v, {tool_type_name: {tool_name: {...}, ...}, ...}
+        self.tools_by_types = defaultdict(dict)  # two-layer k-v, {tool_type: {tool_name: {...}, ...}, ...}
 
     def register_tool_type(self, tool_type: ToolType):
         self.tool_types[tool_type.name] = tool_type
@@ -33,13 +33,13 @@ class ToolRegistry:
         tool_path,
         schema_path=None,
         tool_code="",
-        tool_type_name="other",
+        tool_type="other",
         make_schema_if_not_exists=False,
     ):
         if self.has_tool(tool_name):
             return
 
-        schema_path = schema_path or TOOL_SCHEMA_PATH / tool_type_name / f"{tool_name}.yml"
+        schema_path = schema_path or TOOL_SCHEMA_PATH / tool_type / f"{tool_name}.yml"
 
         if not os.path.exists(schema_path):
             if make_schema_if_not_exists:
@@ -62,7 +62,7 @@ class ToolRegistry:
             # )
         tool = Tool(name=tool_name, path=tool_path, schemas=schemas, code=tool_code)
         self.tools[tool_name] = tool
-        self.tools_by_types[tool_type_name][tool_name] = tool
+        self.tools_by_types[tool_type][tool_name] = tool
         logger.info(f"{tool_name} registered")
 
     def has_tool(self, key):
@@ -94,7 +94,7 @@ def register_tool_type(cls):
     return cls
 
 
-def register_tool(tool_name="", tool_type_name="other", schema_path=None):
+def register_tool(tool_name="", tool_type="other", schema_path=None):
     """register a tool to registry"""
 
     def decorator(cls, tool_name=tool_name):
@@ -111,7 +111,7 @@ def register_tool(tool_name="", tool_type_name="other", schema_path=None):
             tool_path=file_path,
             schema_path=schema_path,
             tool_code=source_code,
-            tool_type_name=tool_type_name,
+            tool_type=tool_type,
         )
         return cls
 
diff --git a/metagpt/tools/tool_types.py b/metagpt/tools/tool_types.py
index 2e22adc40..b5b233d53 100644
--- a/metagpt/tools/tool_types.py
+++ b/metagpt/tools/tool_types.py
@@ -1,6 +1,7 @@
 from metagpt.prompts.tool_types import (
     DATA_PREPROCESS_PROMPT,
     FEATURE_ENGINEERING_PROMPT,
+    IMAGE2WEBPAGE_PROMPT,
     MODEL_EVALUATE_PROMPT,
     MODEL_TRAIN_PROMPT,
 )
@@ -48,6 +49,13 @@ class StableDiffusion(ToolType):
     desc: str = "Related to text2image, image2image using stable diffusion model."
 
 
+@register_tool_type
+class Image2Webpage(ToolType):
+    name: str = ToolTypeEnum.IMAGE2WEBPAGE.value
+    desc: str = "For converting image into webpage code."
+    usage_prompt: str = IMAGE2WEBPAGE_PROMPT
+
+
 @register_tool_type
 class Other(ToolType):
     name: str = ToolTypeEnum.OTHER.value
diff --git a/tests/metagpt/tools/functions/libs/test_vision.py b/tests/metagpt/tools/functions/libs/test_vision.py
deleted file mode 100644
index f4f97c46a..000000000
--- a/tests/metagpt/tools/functions/libs/test_vision.py
+++ /dev/null
@@ -1,48 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-"""
-@Time    : 2024/01/15
-@Author  : mannaandpoem
-@File    : test_vision.py
-"""
-import pytest
-
-from metagpt import logs
-from metagpt.tools.functions.libs.vision import Vision
-
-
-@pytest.fixture
-def mock_webpages():
-    return """```html\n<html>\n<script src="scripts.js"></script>
-<link rel="stylesheet" href="styles.css(">\n</html>\n```\n
-```css\n.class { ... }\n```\n
-```javascript\nfunction() { ... }\n```\n"""
-
-
-def test_vision_generate_webpages(mocker, mock_webpages):
-    mocker.patch(
-        "metagpt.tools.functions.libs.vision.Vision.generate_web_pages",
-        return_value=mock_webpages
-    )
-    image_path = "image.png"
-    vision = Vision()
-    rsp = vision.generate_web_pages(image_path=image_path)
-    logs.logger.info(rsp)
-    assert "html" in rsp
-    assert "css" in rsp
-    assert "javascript" in rsp
-
-
-def test_save_webpages(mocker, mock_webpages):
-    mocker.patch(
-        "metagpt.tools.functions.libs.vision.Vision.generate_web_pages",
-        return_value=mock_webpages
-    )
-    image_path = "image.png"
-    vision = Vision()
-    webpages = vision.generate_web_pages(image_path)
-    webpages_dir = vision.save_webpages(image_path=image_path, webpages=webpages)
-    logs.logger.info(webpages_dir)
-    assert webpages_dir.exists()
-
-
diff --git a/tests/metagpt/tools/libs/test_gpt_v_generator.py b/tests/metagpt/tools/libs/test_gpt_v_generator.py
new file mode 100644
index 000000000..360ca4a75
--- /dev/null
+++ b/tests/metagpt/tools/libs/test_gpt_v_generator.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+@Time    : 2024/01/15
+@Author  : mannaandpoem
+@File    : test_gpt_v_generator.py
+"""
+import pytest
+
+from metagpt import logs
+from metagpt.tools.libs.gpt_v_generator import GPTvGenerator
+
+
+@pytest.fixture
+def mock_webpages(mocker):
+    mock_data = """```html\n<html>\n<script src="scripts.js"></script>
+<link rel="stylesheet" href="styles.css(">\n</html>\n```\n
+```css\n.class { ... }\n```\n
+```javascript\nfunction() { ... }\n```\n"""
+    mocker.patch("metagpt.tools.libs.gpt_v_generator.GPTvGenerator.generate_web_pages", return_value=mock_data)
+    return mocker
+
+
+def test_vision_generate_webpages(mock_webpages):
+    image_path = "image.png"
+    generator = GPTvGenerator()
+    rsp = generator.generate_web_pages(image_path=image_path)
+    logs.logger.info(rsp)
+    assert "html" in rsp
+    assert "css" in rsp
+    assert "javascript" in rsp
+
+
+def test_save_webpages(mock_webpages):
+    image_path = "image.png"
+    generator = GPTvGenerator()
+    webpages = generator.generate_web_pages(image_path)
+    webpages_dir = generator.save_webpages(image_path=image_path, webpages=webpages)
+    logs.logger.info(webpages_dir)
+    assert webpages_dir.exists()
diff --git a/tests/metagpt/tools/libs/test_sd.py b/tests/metagpt/tools/libs/test_sd_engine.py
similarity index 100%
rename from tests/metagpt/tools/libs/test_sd.py
rename to tests/metagpt/tools/libs/test_sd_engine.py
diff --git a/tests/metagpt/tools/test_tool_registry.py b/tests/metagpt/tools/test_tool_registry.py
index fd758b141..582c368a8 100644
--- a/tests/metagpt/tools/test_tool_registry.py
+++ b/tests/metagpt/tools/test_tool_registry.py
@@ -88,7 +88,7 @@ def test_get_tools_by_type(tool_registry, schema_yaml):
     tool_type = ToolType(name=tool_type_name, desc="test")
     tool_registry.register_tool_type(tool_type)
 
-    tool_registry.register_tool(tool_name, tool_path, tool_type_name=tool_type_name)
+    tool_registry.register_tool(tool_name, tool_path, tool_type=tool_type_name)
 
     tools_by_type = tool_registry.get_tools_by_type(tool_type_name)
     assert tools_by_type is not None

From c32dcca293e2431cecd147e670951a8bb2a8c13d Mon Sep 17 00:00:00 2001
From: yzlin <yzlin@fuzhi.ai>
Date: Thu, 18 Jan 2024 21:17:34 +0800
Subject: [PATCH 09/12] fix schema reading bug

---
 metagpt/tools/tool_registry.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/metagpt/tools/tool_registry.py b/metagpt/tools/tool_registry.py
index 0544d25ee..52ad25ce4 100644
--- a/metagpt/tools/tool_registry.py
+++ b/metagpt/tools/tool_registry.py
@@ -51,7 +51,7 @@ class ToolRegistry:
 
         with open(schema_path, "r", encoding="utf-8") as f:
             schema_dict = yaml.safe_load(f)
-            schemas = schema_dict.get(tool_name) or dict(schema_dict.values())
+            schemas = schema_dict.get(tool_name) or list(schema_dict.values())[0]
         schemas["tool_path"] = tool_path  # corresponding code file path of the tool
         try:
             ToolSchema(**schemas)  # validation

From 88c4c8c90d25e7d7b46ba453df55106345be6843 Mon Sep 17 00:00:00 2001
From: yzlin <yzlin@fuzhi.ai>
Date: Thu, 18 Jan 2024 23:26:34 +0800
Subject: [PATCH 10/12] integrate web scraping tool

---
 metagpt/tools/__init__.py                                | 2 +-
 metagpt/tools/functions/libs/scrape_web/__init__.py      | 1 -
 metagpt/tools/libs/__init__.py                           | 3 ++-
 .../scrape_web/scrape_web.py => libs/web_scrapping.py}   | 9 ++++-----
 .../web_scrapping/scrape_web_playwright.yml}             | 2 +-
 metagpt/tools/tool_data_type.py                          | 1 +
 metagpt/tools/tool_types.py                              | 8 +++++++-
 metagpt/tools/web_browser_engine_playwright.py           | 3 ++-
 8 files changed, 18 insertions(+), 11 deletions(-)
 delete mode 100644 metagpt/tools/functions/libs/scrape_web/__init__.py
 rename metagpt/tools/{functions/libs/scrape_web/scrape_web.py => libs/web_scrapping.py} (76%)
 rename metagpt/tools/{functions/schemas/scrape_web.yml => schemas/web_scrapping/scrape_web_playwright.yml} (96%)

diff --git a/metagpt/tools/__init__.py b/metagpt/tools/__init__.py
index f18d1d276..bb87f1b62 100644
--- a/metagpt/tools/__init__.py
+++ b/metagpt/tools/__init__.py
@@ -11,7 +11,7 @@ from metagpt.tools import tool_types  # this registers all tool types
 from metagpt.tools import libs  # this registers all tools
 from metagpt.tools.tool_registry import TOOL_REGISTRY
 
-_, _, _ = tool_types, libs, TOOL_REGISTRY  # Avoid pre-commit error
+_ = tool_types, libs, TOOL_REGISTRY  # Avoid pre-commit error
 
 
 class SearchEngineType(Enum):
diff --git a/metagpt/tools/functions/libs/scrape_web/__init__.py b/metagpt/tools/functions/libs/scrape_web/__init__.py
deleted file mode 100644
index d5cd1524b..000000000
--- a/metagpt/tools/functions/libs/scrape_web/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from metagpt.tools.functions.libs.scrape_web.scrape_web import scrape_web
diff --git a/metagpt/tools/libs/__init__.py b/metagpt/tools/libs/__init__.py
index b576997c9..442f57149 100644
--- a/metagpt/tools/libs/__init__.py
+++ b/metagpt/tools/libs/__init__.py
@@ -9,6 +9,7 @@ from metagpt.tools.libs import (
     feature_engineering,
     sd_engine,
     gpt_v_generator,
+    web_scrapping,
 )
 
-_, _, _, _ = data_preprocess, feature_engineering, sd_engine, gpt_v_generator  # Avoid pre-commit error
+_ = data_preprocess, feature_engineering, sd_engine, gpt_v_generator, web_scrapping  # Avoid pre-commit error
diff --git a/metagpt/tools/functions/libs/scrape_web/scrape_web.py b/metagpt/tools/libs/web_scrapping.py
similarity index 76%
rename from metagpt/tools/functions/libs/scrape_web/scrape_web.py
rename to metagpt/tools/libs/web_scrapping.py
index e68ce0e64..e8e73f123 100644
--- a/metagpt/tools/functions/libs/scrape_web/scrape_web.py
+++ b/metagpt/tools/libs/web_scrapping.py
@@ -1,9 +1,10 @@
-import asyncio
-
+from metagpt.tools.tool_data_type import ToolTypeEnum
+from metagpt.tools.tool_registry import register_tool
 from metagpt.tools.web_browser_engine_playwright import PlaywrightWrapper
 
 
-async def scrape_web(url, *urls):
+@register_tool(tool_type=ToolTypeEnum.WEBSCRAPING.value)
+async def scrape_web_playwright(url, *urls):
     """
     Scrape and save the HTML structure and inner text content of a web page using Playwright.
 
@@ -19,5 +20,3 @@ async def scrape_web(url, *urls):
 
     # Return the inner text content of the web page
     return {"inner_text": web.inner_text, "html": web.html}
-
-# 需要改三个地方: yaml, 对应路径下init, MetaGPT/metagpt/prompts/ml_engineer.py中ML_MODULE_MAP
diff --git a/metagpt/tools/functions/schemas/scrape_web.yml b/metagpt/tools/schemas/web_scraping/scrape_web_playwright.yml
similarity index 96%
rename from metagpt/tools/functions/schemas/scrape_web.yml
rename to metagpt/tools/schemas/web_scraping/scrape_web_playwright.yml
index ecca3fbed..a6ff7d6c7 100644
--- a/metagpt/tools/functions/schemas/scrape_web.yml
+++ b/metagpt/tools/schemas/web_scraping/scrape_web_playwright.yml
@@ -1,4 +1,4 @@
-scrape_web:
+scrape_web_playwright:
-  type: async funciton
+  type: async function
   description: "Scrape and save the HTML structure and inner text content of a web page using Playwright."
   parameters:
diff --git a/metagpt/tools/tool_data_type.py b/metagpt/tools/tool_data_type.py
index 45fb539a6..0c4eea4cc 100644
--- a/metagpt/tools/tool_data_type.py
+++ b/metagpt/tools/tool_data_type.py
@@ -11,6 +11,7 @@ class ToolTypeEnum(Enum):
     MODEL_EVALUATE = "model_evaluate"
     STABLE_DIFFUSION = "stable_diffusion"
     IMAGE2WEBPAGE = "image2webpage"
+    WEBSCRAPING = "web_scraping"
     OTHER = "other"
 
     def __missing__(self, key):
diff --git a/metagpt/tools/tool_types.py b/metagpt/tools/tool_types.py
index b5b233d53..35c0772b1 100644
--- a/metagpt/tools/tool_types.py
+++ b/metagpt/tools/tool_types.py
@@ -12,7 +12,7 @@ from metagpt.tools.tool_registry import register_tool_type
 @register_tool_type
 class EDA(ToolType):
     name: str = ToolTypeEnum.EDA.value
-    desc: str = "Useful for performing exploratory data analysis"
+    desc: str = "For performing exploratory data analysis"
 
 
 @register_tool_type
@@ -56,6 +56,12 @@ class Image2Webpage(ToolType):
     usage_prompt: str = IMAGE2WEBPAGE_PROMPT
 
 
+@register_tool_type
+class WebScraping(ToolType):
+    name: str = ToolTypeEnum.WEBSCRAPING.value
+    desc: str = "For scraping data from web pages."
+
+
 @register_tool_type
 class Other(ToolType):
     name: str = ToolTypeEnum.OTHER.value
diff --git a/metagpt/tools/web_browser_engine_playwright.py b/metagpt/tools/web_browser_engine_playwright.py
index a45f6a12e..15c8a78d7 100644
--- a/metagpt/tools/web_browser_engine_playwright.py
+++ b/metagpt/tools/web_browser_engine_playwright.py
@@ -12,7 +12,6 @@ from typing import Literal
 
 from playwright.async_api import async_playwright
 
-from metagpt.config import CONFIG
 from metagpt.logs import logger
 from metagpt.utils.parse_html import WebPage
 
@@ -32,6 +31,8 @@ class PlaywrightWrapper:
         launch_kwargs: dict | None = None,
         **kwargs,
     ) -> None:
+        from metagpt.config import CONFIG
+
         if browser_type is None:
             browser_type = CONFIG.playwright_browser_type
         self.browser_type = browser_type

From 3faa094248d819a178156471c9990089b9a8d5a7 Mon Sep 17 00:00:00 2001
From: yzlin <yzlin@fuzhi.ai>
Date: Thu, 18 Jan 2024 23:45:37 +0800
Subject: [PATCH 11/12] fix aask_code issues in ml_engineer

---
 metagpt/actions/debug_code.py          |  3 +--
 metagpt/actions/ml_da_action.py        |  2 +-
 metagpt/actions/write_analysis_code.py |  8 ++++----
 metagpt/roles/code_interpreter.py      | 11 ++++-------
 metagpt/roles/ml_engineer.py           |  4 ++--
 5 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/metagpt/actions/debug_code.py b/metagpt/actions/debug_code.py
index e5e0ac5d4..121c126c4 100644
--- a/metagpt/actions/debug_code.py
+++ b/metagpt/actions/debug_code.py
@@ -119,5 +119,4 @@ class DebugCode(BaseWriteAnalysisCode):
             runtime_result=runtime_result,
         )
         # 根据reflection结果重写代码
-        improv_code = reflection["improved_impl"]
-        return improv_code
+        return {"code": reflection["improved_impl"]}
diff --git a/metagpt/actions/ml_da_action.py b/metagpt/actions/ml_da_action.py
index 584c4db7a..d4e77773f 100644
--- a/metagpt/actions/ml_da_action.py
+++ b/metagpt/actions/ml_da_action.py
@@ -63,4 +63,4 @@ class UpdateDataColumns(Action):
         prompt = UPDATE_DATA_COLUMNS.format(history_code=code_context)
         tool_config = create_func_config(PRINT_DATA_COLUMNS)
         rsp = await self.llm.aask_code(prompt, **tool_config)
-        return rsp["code"]
+        return rsp
diff --git a/metagpt/actions/write_analysis_code.py b/metagpt/actions/write_analysis_code.py
index 65be198ef..cf806a986 100644
--- a/metagpt/actions/write_analysis_code.py
+++ b/metagpt/actions/write_analysis_code.py
@@ -59,7 +59,7 @@ class BaseWriteAnalysisCode(Action):
             }
         return messages
 
-    async def run(self, context: List[Message], plan: Plan = None) -> str:
+    async def run(self, context: List[Message], plan: Plan = None) -> dict:
         """Run of a code writing action, used in data analysis or modeling
 
         Args:
@@ -67,7 +67,7 @@ class BaseWriteAnalysisCode(Action):
             plan (Plan, optional): Overall plan. Defaults to None.
 
         Returns:
-            str: The code string.
+            dict: code result in the format of {"code": "print('hello world')", "language": "python"}
         """
 
 
@@ -174,7 +174,7 @@ class WriteCodeWithTools(BaseWriteAnalysisCode):
 
         tool_config = create_func_config(CODE_GENERATOR_WITH_TOOLS)
         rsp = await self.llm.aask_code(prompt, **tool_config)
-        return rsp["code"]
+        return rsp
 
 
 class WriteCodeWithToolsML(WriteCodeWithTools):
@@ -230,7 +230,7 @@ class WriteCodeWithToolsML(WriteCodeWithTools):
         tool_config = create_func_config(CODE_GENERATOR_WITH_TOOLS)
         rsp = await self.llm.aask_code(prompt, **tool_config)
         context = [Message(content=prompt, role="user")]
-        return context, rsp["code"]
+        return context, rsp
 
 
 class MakeTools(WriteCodeByGenerate):
diff --git a/metagpt/roles/code_interpreter.py b/metagpt/roles/code_interpreter.py
index 46cc00d5e..f972e72e2 100644
--- a/metagpt/roles/code_interpreter.py
+++ b/metagpt/roles/code_interpreter.py
@@ -54,7 +54,7 @@ class CodeInterpreter(Role):
 
     async def _act_on_task(self, current_task: Task) -> TaskResult:
         code, result, is_success = await self._write_and_exec_code()
-        task_result = TaskResult(code=code['code'], result=result, is_success=is_success)
+        task_result = TaskResult(code=code, result=result, is_success=is_success)
         return task_result
 
     async def _write_and_exec_code(self, max_retry: int = 3):
@@ -69,7 +69,7 @@ class CodeInterpreter(Role):
             ### write code ###
             code, cause_by = await self._write_code()
 
-            self.working_memory.add(Message(content=code['code'], role="assistant", cause_by=cause_by))
+            self.working_memory.add(Message(content=code["code"], role="assistant", cause_by=cause_by))
 
             ### execute code ###
             result, success = await self.execute_code.run(**code)
@@ -78,7 +78,7 @@ class CodeInterpreter(Role):
             self.working_memory.add(Message(content=result, role="user", cause_by=ExecutePyCode))
 
             ### process execution result ###
-            if "!pip" in code:
+            if "!pip" in code["code"]:
                 success = False
 
             counter += 1
@@ -89,7 +89,7 @@ class CodeInterpreter(Role):
                 if ReviewConst.CHANGE_WORD[0] in review:
                     counter = 0  # redo the task again with help of human suggestions
 
-        return code, result, success
+        return code["code"], result, success
 
     async def _write_code(self):
         todo = WriteCodeByGenerate() if not self.use_tools else WriteCodeWithTools()
@@ -98,9 +98,6 @@ class CodeInterpreter(Role):
         context = self.planner.get_useful_memories()
         # print(*context, sep="\n***\n")
         code = await todo.run(context=context, plan=self.planner.plan, temperature=0.0)
-        # 暂时在这里转换 WriteCodeWithTools 的输出
-        if isinstance(code, str):
-            code = {'code': code, 'language': 'python'}
 
         return code, todo
 
diff --git a/metagpt/roles/ml_engineer.py b/metagpt/roles/ml_engineer.py
index aeea39c0c..6b671f9c2 100644
--- a/metagpt/roles/ml_engineer.py
+++ b/metagpt/roles/ml_engineer.py
@@ -46,7 +46,7 @@ class MLEngineer(CodeInterpreter):
             logger.info(f"new code \n{code}")
             cause_by = DebugCode
 
-        self.latest_code = code
+        self.latest_code = code["code"]
 
         return code, cause_by
 
@@ -61,6 +61,6 @@ class MLEngineer(CodeInterpreter):
         logger.info("Check columns in updated data")
         code = await UpdateDataColumns().run(self.planner.plan)
         success = False
-        result, success = await self.execute_code.run(code)
+        result, success = await self.execute_code.run(**code)
         print(result)
         return result if success else ""

From 23fccdde67f50fed24906f22c5f3f8c0a58002da Mon Sep 17 00:00:00 2001
From: yzlin <yzlin@fuzhi.ai>
Date: Fri, 19 Jan 2024 00:09:58 +0800
Subject: [PATCH 12/12] update mock llm aask_code

---
 tests/mock/mock_llm.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/mock/mock_llm.py b/tests/mock/mock_llm.py
index 45b28c63b..a52aeed09 100644
--- a/tests/mock/mock_llm.py
+++ b/tests/mock/mock_llm.py
@@ -69,7 +69,6 @@ class MockLLM(OriginalLLM):
         A copy of metagpt.provider.openai_api.OpenAILLM.aask_code, we can't use super().aask because it will be mocked.
         Since openai_api.OpenAILLM.aask_code is different from base_llm.BaseLLM.aask_code, we use the former.
         """
-        messages = self._process_message(messages)
         rsp = await self._achat_completion_function(messages, **kwargs)
         return self.get_choice_function_arguments(rsp)