Merge branch 'tool_manage_new' into 'code_intepreter'

convert local class or function to tool, tool clarification at role initialization

See merge request agents/data_agents_opt!55
This commit is contained in:
林义章 2024-01-22 09:12:02 +00:00
commit 7f5f95d41b
18 changed files with 807 additions and 147 deletions

View file

@ -9,7 +9,7 @@ from metagpt.tools.libs import (
feature_engineering,
sd_engine,
gpt_v_generator,
web_scrapping,
web_scraping,
)
_ = data_preprocess, feature_engineering, sd_engine, gpt_v_generator, web_scrapping # Avoid pre-commit error
_ = data_preprocess, feature_engineering, sd_engine, gpt_v_generator, web_scraping # Avoid pre-commit error

View file

@ -26,31 +26,64 @@ class MLProcess(object):
def transform(self, df):
raise NotImplementedError
def fit_transform(self, df):
def fit_transform(self, df) -> pd.DataFrame:
"""
Fit and transform the input DataFrame.
Args:
df (pd.DataFrame): The input DataFrame.
Returns:
pd.DataFrame: The transformed DataFrame.
"""
self.fit(df)
return self.transform(df)
@register_tool(tool_type=TOOL_TYPE)
class FillMissingValue(MLProcess):
def __init__(
self,
features: list,
strategy: str = "mean",
fill_value=None,
):
"""
Completing missing values with simple strategies.
"""
def __init__(self, features: list, strategy: str = "mean", fill_value=None):
"""
Initialize self.
Args:
features (list): Columns to be processed.
strategy (str, optional): The imputation strategy, notice 'mean' and 'median' can only
be used for numeric features. Enum: ['mean', 'median', 'most_frequent', 'constant']. Defaults to 'mean'.
fill_value (int, optional): Fill_value is used to replace all occurrences of missing_values.
Defaults to None.
"""
self.features = features
self.strategy = strategy
self.fill_value = fill_value
self.si = None
def fit(self, df: pd.DataFrame):
"""
Fit the FillMissingValue model.
Args:
df (pd.DataFrame): The input DataFrame.
"""
if len(self.features) == 0:
return
self.si = SimpleImputer(strategy=self.strategy, fill_value=self.fill_value)
self.si.fit(df[self.features])
def transform(self, df: pd.DataFrame):
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Transform the input DataFrame with the fitted model.
Args:
df (pd.DataFrame): The input DataFrame.
Returns:
pd.DataFrame: The transformed DataFrame.
"""
if len(self.features) == 0:
return df
new_df = df.copy()
@ -60,18 +93,40 @@ class FillMissingValue(MLProcess):
@register_tool(tool_type=TOOL_TYPE)
class MinMaxScale(MLProcess):
def __init__(
self,
features: list,
):
"""
Transform features by scaling each feature to a range, which is (0, 1).
"""
def __init__(self, features: list):
"""
Initialize self.
Args:
features (list): Columns to be processed.
"""
self.features = features
self.mms = None
def fit(self, df: pd.DataFrame):
"""
Fit the MinMaxScale model.
Args:
df (pd.DataFrame): The input DataFrame.
"""
self.mms = MinMaxScaler()
self.mms.fit(df[self.features])
def transform(self, df: pd.DataFrame):
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Transform the input DataFrame with the fitted model.
Args:
df (pd.DataFrame): The input DataFrame.
Returns:
pd.DataFrame: The transformed DataFrame.
"""
new_df = df.copy()
new_df[self.features] = self.mms.transform(new_df[self.features])
return new_df
@ -79,18 +134,40 @@ class MinMaxScale(MLProcess):
@register_tool(tool_type=TOOL_TYPE)
class StandardScale(MLProcess):
def __init__(
self,
features: list,
):
"""
Standardize features by removing the mean and scaling to unit variance.
"""
def __init__(self, features: list):
"""
Initialize self.
Args:
features (list): Columns to be processed.
"""
self.features = features
self.ss = None
def fit(self, df: pd.DataFrame):
"""
Fit the StandardScale model.
Args:
df (pd.DataFrame): The input DataFrame.
"""
self.ss = StandardScaler()
self.ss.fit(df[self.features])
def transform(self, df: pd.DataFrame):
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Transform the input DataFrame with the fitted model.
Args:
df (pd.DataFrame): The input DataFrame.
Returns:
pd.DataFrame: The transformed DataFrame.
"""
new_df = df.copy()
new_df[self.features] = self.ss.transform(new_df[self.features])
return new_df
@ -98,18 +175,40 @@ class StandardScale(MLProcess):
@register_tool(tool_type=TOOL_TYPE)
class MaxAbsScale(MLProcess):
def __init__(
self,
features: list,
):
"""
Scale each feature by its maximum absolute value.
"""
def __init__(self, features: list):
"""
Initialize self.
Args:
features (list): Columns to be processed.
"""
self.features = features
self.mas = None
def fit(self, df: pd.DataFrame):
"""
Fit the MaxAbsScale model.
Args:
df (pd.DataFrame): The input DataFrame.
"""
self.mas = MaxAbsScaler()
self.mas.fit(df[self.features])
def transform(self, df: pd.DataFrame):
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Transform the input DataFrame with the fitted model.
Args:
df (pd.DataFrame): The input DataFrame.
Returns:
pd.DataFrame: The transformed DataFrame.
"""
new_df = df.copy()
new_df[self.features] = self.mas.transform(new_df[self.features])
return new_df
@ -117,18 +216,40 @@ class MaxAbsScale(MLProcess):
@register_tool(tool_type=TOOL_TYPE)
class RobustScale(MLProcess):
def __init__(
self,
features: list,
):
"""
Apply the RobustScaler to scale features using statistics that are robust to outliers.
"""
def __init__(self, features: list):
"""
Initialize the RobustScale instance with feature names.
Args:
features (list): List of feature names to be scaled.
"""
self.features = features
self.rs = None
def fit(self, df: pd.DataFrame):
"""
Compute the median and IQR for scaling.
Args:
df (pd.DataFrame): Dataframe containing the features.
"""
self.rs = RobustScaler()
self.rs.fit(df[self.features])
def transform(self, df: pd.DataFrame):
"""
Scale features using the previously computed median and IQR.
Args:
df (pd.DataFrame): Dataframe containing the features to be scaled.
Returns:
pd.DataFrame: A new dataframe with scaled features.
"""
new_df = df.copy()
new_df[self.features] = self.rs.transform(new_df[self.features])
return new_df
@ -136,18 +257,40 @@ class RobustScale(MLProcess):
@register_tool(tool_type=TOOL_TYPE)
class OrdinalEncode(MLProcess):
def __init__(
self,
features: list,
):
"""
Encode categorical features as ordinal integers.
"""
def __init__(self, features: list):
"""
Initialize the OrdinalEncode instance with feature names.
Args:
features (list): List of categorical feature names to be encoded.
"""
self.features = features
self.oe = None
def fit(self, df: pd.DataFrame):
"""
Learn the ordinal encodings for the features.
Args:
df (pd.DataFrame): Dataframe containing the categorical features.
"""
self.oe = OrdinalEncoder()
self.oe.fit(df[self.features])
def transform(self, df: pd.DataFrame):
"""
Convert the categorical features to ordinal integers.
Args:
df (pd.DataFrame): Dataframe containing the categorical features to be encoded.
Returns:
pd.DataFrame: A new dataframe with the encoded features.
"""
new_df = df.copy()
new_df[self.features] = self.oe.transform(new_df[self.features])
return new_df
@ -155,18 +298,40 @@ class OrdinalEncode(MLProcess):
@register_tool(tool_type=TOOL_TYPE)
class OneHotEncode(MLProcess):
def __init__(
self,
features: list,
):
"""
Apply one-hot encoding to specified categorical columns, the original columns will be dropped.
"""
def __init__(self, features: list):
"""
Initialize self.
Args:
features (list): Categorical columns to be one-hot encoded and dropped.
"""
self.features = features
self.ohe = None
def fit(self, df: pd.DataFrame):
"""
Fit the OneHotEncoding model.
Args:
df (pd.DataFrame): The input DataFrame.
"""
self.ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)
self.ohe.fit(df[self.features])
def transform(self, df: pd.DataFrame):
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Transform the input DataFrame with the fitted model.
Args:
df (pd.DataFrame): The input DataFrame.
Returns:
pd.DataFrame: The transformed DataFrame.
"""
ts_data = self.ohe.transform(df[self.features])
new_columns = self.ohe.get_feature_names_out(self.features)
ts_data = pd.DataFrame(ts_data, columns=new_columns, index=df.index)
@ -177,21 +342,43 @@ class OneHotEncode(MLProcess):
@register_tool(tool_type=TOOL_TYPE)
class LabelEncode(MLProcess):
def __init__(
self,
features: list,
):
"""
Apply label encoding to specified categorical columns in-place.
"""
def __init__(self, features: list):
"""
Initialize self.
Args:
features (list): Categorical columns to be label encoded.
"""
self.features = features
self.le_encoders = []
def fit(self, df: pd.DataFrame):
"""
Fit the LabelEncode model.
Args:
df (pd.DataFrame): The input DataFrame.
"""
if len(self.features) == 0:
return
for col in self.features:
le = LabelEncoder().fit(df[col].astype(str).unique().tolist() + ["unknown"])
self.le_encoders.append(le)
def transform(self, df: pd.DataFrame):
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Transform the input DataFrame with the fitted model.
Args:
df (pd.DataFrame): The input DataFrame.
Returns:
pd.DataFrame: The transformed DataFrame.
"""
if len(self.features) == 0:
return df
new_df = df.copy()
@ -204,8 +391,17 @@ class LabelEncode(MLProcess):
return new_df
@register_tool(tool_type=TOOL_TYPE)
def get_column_info(df: pd.DataFrame) -> dict:
"""
Analyzes a DataFrame and categorizes its columns based on data types.
Args:
df (pd.DataFrame): The DataFrame to be analyzed.
Returns:
dict: A dictionary with four keys ('Category', 'Numeric', 'Datetime', 'Others').
Each key corresponds to a list of column names belonging to that category.
"""
column_info = {
"Category": [],
"Numeric": [],

View file

@ -184,7 +184,7 @@ class SplitBins(MLProcess):
return new_df
@register_tool(tool_type=TOOL_TYPE)
# @register_tool(tool_type=TOOL_TYPE)
class ExtractTimeComps(MLProcess):
def __init__(self, time_col: str, time_comps: list):
self.time_col = time_col
@ -242,6 +242,7 @@ class GeneralSelection(MLProcess):
# skip for now because lgb is needed
# @register_tool(tool_type=TOOL_TYPE)
class TreeBasedSelection(MLProcess):
def __init__(self, label_col: str, task_type: str):
self.label_col = label_col

View file

@ -0,0 +1,46 @@
# NOTE(review): this schema appears to be auto-generated from the OrdinalEncode
# class docstrings (see make_schema / convert_code_to_tool_schema) — prefer
# regenerating over hand-editing; confirm before modifying by hand.
OrdinalEncode:
  type: class
  description: Encode categorical features as ordinal integers.
  methods:
    __init__:
      description: 'Initialize the OrdinalEncode instance with feature names. '
      parameters:
        properties:
          features:
            type: list
            description: List of categorical feature names to be encoded.
        required:
        - features
    fit:
      description: 'Learn the ordinal encodings for the features. '
      parameters:
        properties:
          df:
            type: pd.DataFrame
            description: Dataframe containing the categorical features.
        required:
        - df
    fit_transform:
      description: 'Fit and transform the input DataFrame. '
      parameters:
        properties:
          df:
            type: pd.DataFrame
            description: The input DataFrame.
        required:
        - df
      returns:
      - type: pd.DataFrame
        description: The transformed DataFrame.
    transform:
      description: 'Convert the categorical features to ordinal integers. '
      parameters:
        properties:
          df:
            type: pd.DataFrame
            description: Dataframe containing the categorical features to be encoded.
        required:
        - df
      returns:
      - type: pd.DataFrame
        description: A new dataframe with the encoded features.

View file

@ -0,0 +1,47 @@
# NOTE(review): this schema appears to be auto-generated from the RobustScale
# class docstrings (see make_schema / convert_code_to_tool_schema) — prefer
# regenerating over hand-editing; confirm before modifying by hand.
RobustScale:
  type: class
  description: Apply the RobustScaler to scale features using statistics that are
    robust to outliers.
  methods:
    __init__:
      description: 'Initialize the RobustScale instance with feature names. '
      parameters:
        properties:
          features:
            type: list
            description: List of feature names to be scaled.
        required:
        - features
    fit:
      description: 'Compute the median and IQR for scaling. '
      parameters:
        properties:
          df:
            type: pd.DataFrame
            description: Dataframe containing the features.
        required:
        - df
    fit_transform:
      description: 'Fit and transform the input DataFrame. '
      parameters:
        properties:
          df:
            type: pd.DataFrame
            description: The input DataFrame.
        required:
        - df
      returns:
      - type: pd.DataFrame
        description: The transformed DataFrame.
    transform:
      description: 'Scale features using the previously computed median and IQR. '
      parameters:
        properties:
          df:
            type: pd.DataFrame
            description: Dataframe containing the features to be scaled.
        required:
        - df
      returns:
      - type: pd.DataFrame
        description: A new dataframe with scaled features.

View file

@ -0,0 +1,72 @@
import inspect
from metagpt.utils.parse_docstring import GoogleDocstringParser, remove_spaces
def convert_code_to_tool_schema(obj, include: list[str] = None):
    """Convert a class or function object into a tool-schema dict keyed by its name.

    Args:
        obj: The class or function to convert. It must carry a docstring.
        include (list[str], optional): For classes, only methods whose names are
            listed here are included; a falsy value (None/empty) includes all
            methods that have a docstring. Defaults to None.

    Returns:
        dict: ``{obj.__name__: schema}`` where ``schema`` holds the object's
        type ("class" or "function"), its description, and — for classes —
        a ``methods`` mapping built from each method's docstring.

    Raises:
        AssertionError: If ``obj`` has no docstring.
        TypeError: If ``obj`` is neither a class nor a function.
    """
    docstring = inspect.getdoc(obj)
    assert docstring, "no docstring found for the objects, skip registering"

    # Avoid the mutable-default-argument pitfall: normalize inside the function.
    include = include or []
    if inspect.isclass(obj):
        schema = {"type": "class", "description": remove_spaces(docstring), "methods": {}}
        for name, method in inspect.getmembers(obj, inspect.isfunction):
            if include and name not in include:
                continue
            # Only methods with their own (or inherited) docstring are exposed.
            method_doc = inspect.getdoc(method)
            if method_doc:
                schema["methods"][name] = docstring_to_schema(method_doc)
    elif inspect.isfunction(obj):
        schema = {
            "type": "function",
            **docstring_to_schema(docstring),
        }
    else:
        # Previously `schema` was left unbound here, surfacing as a confusing
        # NameError below; fail explicitly instead.
        raise TypeError(f"Unsupported object type for tool schema conversion: {type(obj)}")
    schema = {obj.__name__: schema}
    return schema
def docstring_to_schema(docstring: str):
    """Parse a Google-style docstring into a schema dict.

    Args:
        docstring (str): The docstring text to parse; None yields an empty dict.

    Returns:
        dict: ``{"description": ..., "parameters": {"properties": ..., "required": [...]}}``
        plus an optional ``"returns"`` list when the docstring has a Returns section.
    """
    if docstring is None:
        return {}

    parser = GoogleDocstringParser(docstring=docstring)

    # Parse the summary/description section
    description = parser.parse_desc()

    # Parse the Args section
    params = parser.parse_params()
    parameter_schema = {"properties": {}, "required": []}
    for param in params:
        param_name, param_type, param_desc = param
        # check required or optional
        is_optional, param_type = parser.check_and_parse_optional(param_type)
        if not is_optional:
            parameter_schema["required"].append(param_name)
        # type and desc
        param_dict = {"type": param_type, "description": remove_spaces(param_desc)}
        # match Default for optional args
        has_default_val, default_val = parser.check_and_parse_default_value(param_desc)
        if has_default_val:
            param_dict["default"] = default_val
        # match Enum
        has_enum, enum_vals = parser.check_and_parse_enum(param_desc)
        if has_enum:
            param_dict["enum"] = enum_vals
        # add to parameter schema
        parameter_schema["properties"].update({param_name: param_dict})

    # Parse the Returns section
    returns = parser.parse_returns()

    # Build the schema dict (later dumped to YAML)
    schema = {
        "description": description,
        "parameters": parameter_schema,
    }
    if returns:
        schema["returns"] = [{"type": ret[0], "description": remove_spaces(ret[1])} for ret in returns]

    return schema

View file

@ -11,17 +11,18 @@ import re
from collections import defaultdict
import yaml
from pydantic import BaseModel
from metagpt.const import TOOL_SCHEMA_PATH
from metagpt.logs import logger
from metagpt.tools.tool_convert import convert_code_to_tool_schema
from metagpt.tools.tool_data_type import Tool, ToolSchema, ToolType
class ToolRegistry:
def __init__(self):
self.tools = {}
self.tool_types = {}
self.tools_by_types = defaultdict(dict) # two-layer k-v, {tool_type: {tool_name: {...}, ...}, ...}
class ToolRegistry(BaseModel):
tools: dict = {}
tool_types: dict = {}
tools_by_types: dict = defaultdict(dict) # two-layer k-v, {tool_type: {tool_name: {...}, ...}, ...}
def register_tool_type(self, tool_type: ToolType):
self.tool_types[tool_type.name] = tool_type
@ -34,7 +35,9 @@ class ToolRegistry:
schema_path=None,
tool_code="",
tool_type="other",
make_schema_if_not_exists=False,
tool_source_object=None,
include_functions=[],
make_schema_if_not_exists=True,
):
if self.has_tool(tool_name):
return
@ -44,14 +47,16 @@ class ToolRegistry:
if not os.path.exists(schema_path):
if make_schema_if_not_exists:
logger.warning(f"no schema found, will make schema at {schema_path}")
make_schema(tool_code, schema_path)
schema_dict = make_schema(tool_source_object, include_functions, schema_path)
else:
logger.warning(f"no schema found at assumed schema_path {schema_path}, skip registering {tool_name}")
return
with open(schema_path, "r", encoding="utf-8") as f:
schema_dict = yaml.safe_load(f)
schemas = schema_dict.get(tool_name) or list(schema_dict.values())[0]
else:
with open(schema_path, "r", encoding="utf-8") as f:
schema_dict = yaml.safe_load(f)
if not schema_dict:
return
schemas = schema_dict.get(tool_name) or list(schema_dict.values())[0]
schemas["tool_path"] = tool_path # corresponding code file path of the tool
try:
ToolSchema(**schemas) # validation
@ -65,22 +70,22 @@ class ToolRegistry:
self.tools_by_types[tool_type][tool_name] = tool
logger.info(f"{tool_name} registered")
def has_tool(self, key):
def has_tool(self, key: str) -> bool:
    """Return True if a tool named ``key`` is registered.

    The annotation previously claimed ``-> Tool``, but a membership test
    yields a bool; use get_tool() to retrieve the Tool object itself.
    """
    return key in self.tools
def get_tool(self, key):
def get_tool(self, key) -> Tool:
return self.tools.get(key)
def get_tools_by_type(self, key):
return self.tools_by_types.get(key)
def get_tools_by_type(self, key) -> dict[str, Tool]:
return self.tools_by_types.get(key, {})
def has_tool_type(self, key):
def has_tool_type(self, key) -> bool:
return key in self.tool_types
def get_tool_type(self, key):
def get_tool_type(self, key) -> ToolType:
return self.tool_types.get(key)
def get_tool_types(self):
def get_tool_types(self) -> dict[str, ToolType]:
return self.tool_types
@ -94,7 +99,7 @@ def register_tool_type(cls):
return cls
def register_tool(tool_name="", tool_type="other", schema_path=None):
def register_tool(tool_name="", tool_type="other", schema_path=None, **kwargs):
"""register a tool to registry"""
def decorator(cls, tool_name=tool_name):
@ -112,15 +117,39 @@ def register_tool(tool_name="", tool_type="other", schema_path=None):
schema_path=schema_path,
tool_code=source_code,
tool_type=tool_type,
tool_source_object=cls,
**kwargs,
)
return cls
return decorator
def make_schema(tool_code, path):
def make_schema(tool_source_object, include, path):
    """Generate a tool-schema YAML file from a class or function object.

    Args:
        tool_source_object: The class or function to derive the schema from.
        include: Method names to keep when the source object is a class;
            a falsy value includes all documented methods.
        path: Destination path for the YAML schema file.

    Returns:
        dict: The generated schema, or {} if generation failed.
    """
    os.makedirs(os.path.dirname(path), exist_ok=True)  # Create the necessary directories
    try:
        schema = convert_code_to_tool_schema(tool_source_object, include=include)
        with open(path, "w", encoding="utf-8") as f:
            # sort_keys=False preserves the docstring-derived key ordering in the YAML
            yaml.dump(schema, f, sort_keys=False)
        logger.info(f"schema made at {path}")
    except Exception as e:
        # Best-effort: a schema-generation failure is logged and reported as {}
        # so tool registration can decide how to proceed, rather than crashing.
        schema = {}
        logger.error(f"Fail to make schema: {e}")
    return schema
def validate_tool_names(tools: list[str], return_tool_object=False) -> list[str]:
    """Filter ``tools`` down to the names registered in TOOL_REGISTRY.

    Unknown names are dropped with a warning. When ``return_tool_object`` is
    True, the registered Tool objects are returned instead of the names.
    """
    validated = []
    for name in tools:
        if TOOL_REGISTRY.has_tool(name):
            validated.append(TOOL_REGISTRY.get_tool(name) if return_tool_object else name)
        else:
            logger.warning(
                f"Specified tool {name} not found and was skipped. Check if you have registered it properly"
            )
    return validated