make tool yaml from class or func docstring

This commit is contained in:
yzlin 2024-01-19 22:29:37 +08:00
parent 23fccdde67
commit c4a60d89e0
9 changed files with 449 additions and 57 deletions

View file

@ -9,7 +9,7 @@ from metagpt.tools.libs import (
feature_engineering,
sd_engine,
gpt_v_generator,
web_scrapping,
web_scraping,
)
_ = data_preprocess, feature_engineering, sd_engine, gpt_v_generator, web_scrapping # Avoid pre-commit error
_ = data_preprocess, feature_engineering, sd_engine, gpt_v_generator, web_scraping # Avoid pre-commit error

View file

@ -26,31 +26,64 @@ class MLProcess(object):
def transform(self, df):
raise NotImplementedError
def fit_transform(self, df):
def fit_transform(self, df) -> pd.DataFrame:
"""
Fit and transform the input DataFrame.
Args:
df (pd.DataFrame): The input DataFrame.
Returns:
pd.DataFrame: The transformed DataFrame.
"""
self.fit(df)
return self.transform(df)
@register_tool(tool_type=TOOL_TYPE)
class FillMissingValue(MLProcess):
def __init__(
self,
features: list,
strategy: str = "mean",
fill_value=None,
):
"""
Completing missing values with simple strategies.
"""
def __init__(self, features: list, strategy: str = "mean", fill_value=None):
"""
Initialize self.
Args:
features (list): Columns to be processed.
strategy (str, optional): The imputation strategy, notice 'mean' and 'median' can only
be used for numeric features. Enum: ['mean', 'median', 'most_frequent', 'constant']. Defaults to 'mean'.
fill_value (int, optional): Fill_value is used to replace all occurrences of missing_values.
Defaults to None.
"""
self.features = features
self.strategy = strategy
self.fill_value = fill_value
self.si = None
def fit(self, df: pd.DataFrame):
"""
Fit the FillMissingValue model.
Args:
df (pd.DataFrame): The input DataFrame.
"""
if len(self.features) == 0:
return
self.si = SimpleImputer(strategy=self.strategy, fill_value=self.fill_value)
self.si.fit(df[self.features])
def transform(self, df: pd.DataFrame):
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Transform the input DataFrame with the fitted model.
Args:
df (pd.DataFrame): The input DataFrame.
Returns:
pd.DataFrame: The transformed DataFrame.
"""
if len(self.features) == 0:
return df
new_df = df.copy()
@ -60,18 +93,40 @@ class FillMissingValue(MLProcess):
@register_tool(tool_type=TOOL_TYPE)
class MinMaxScale(MLProcess):
def __init__(
self,
features: list,
):
"""
Transform features by scaling each feature to a range, which is (0, 1).
"""
def __init__(self, features: list):
"""
Initialize self.
Args:
features (list): Columns to be processed.
"""
self.features = features
self.mms = None
def fit(self, df: pd.DataFrame):
"""
Fit the MinMaxScale model.
Args:
df (pd.DataFrame): The input DataFrame.
"""
self.mms = MinMaxScaler()
self.mms.fit(df[self.features])
def transform(self, df: pd.DataFrame):
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Transform the input DataFrame with the fitted model.
Args:
df (pd.DataFrame): The input DataFrame.
Returns:
pd.DataFrame: The transformed DataFrame.
"""
new_df = df.copy()
new_df[self.features] = self.mms.transform(new_df[self.features])
return new_df
@ -79,18 +134,40 @@ class MinMaxScale(MLProcess):
@register_tool(tool_type=TOOL_TYPE)
class StandardScale(MLProcess):
def __init__(
self,
features: list,
):
"""
Standardize features by removing the mean and scaling to unit variance.
"""
def __init__(self, features: list):
"""
Initialize self.
Args:
features (list): Columns to be processed.
"""
self.features = features
self.ss = None
def fit(self, df: pd.DataFrame):
"""
Fit the StandardScale model.
Args:
df (pd.DataFrame): The input DataFrame.
"""
self.ss = StandardScaler()
self.ss.fit(df[self.features])
def transform(self, df: pd.DataFrame):
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Transform the input DataFrame with the fitted model.
Args:
df (pd.DataFrame): The input DataFrame.
Returns:
pd.DataFrame: The transformed DataFrame.
"""
new_df = df.copy()
new_df[self.features] = self.ss.transform(new_df[self.features])
return new_df
@ -98,18 +175,40 @@ class StandardScale(MLProcess):
@register_tool(tool_type=TOOL_TYPE)
class MaxAbsScale(MLProcess):
def __init__(
self,
features: list,
):
"""
Scale each feature by its maximum absolute value.
"""
def __init__(self, features: list):
"""
Initialize self.
Args:
features (list): Columns to be processed.
"""
self.features = features
self.mas = None
def fit(self, df: pd.DataFrame):
"""
Fit the MaxAbsScale model.
Args:
df (pd.DataFrame): The input DataFrame.
"""
self.mas = MaxAbsScaler()
self.mas.fit(df[self.features])
def transform(self, df: pd.DataFrame):
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Transform the input DataFrame with the fitted model.
Args:
df (pd.DataFrame): The input DataFrame.
Returns:
pd.DataFrame: The transformed DataFrame.
"""
new_df = df.copy()
new_df[self.features] = self.mas.transform(new_df[self.features])
return new_df
@ -117,18 +216,40 @@ class MaxAbsScale(MLProcess):
@register_tool(tool_type=TOOL_TYPE)
class RobustScale(MLProcess):
def __init__(
self,
features: list,
):
"""
Apply the RobustScaler to scale features using statistics that are robust to outliers.
"""
def __init__(self, features: list):
"""
Initialize the RobustScale instance with feature names.
Args:
features (list): List of feature names to be scaled.
"""
self.features = features
self.rs = None
def fit(self, df: pd.DataFrame):
"""
Compute the median and IQR for scaling.
Args:
df (pd.DataFrame): Dataframe containing the features.
"""
self.rs = RobustScaler()
self.rs.fit(df[self.features])
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Scale features using the previously computed median and IQR.

    Args:
        df (pd.DataFrame): Dataframe containing the features to be scaled.

    Returns:
        pd.DataFrame: A new dataframe with scaled features.
    """
    # Work on a copy so the frame passed in by the caller is not modified.
    new_df = df.copy()
    new_df[self.features] = self.rs.transform(new_df[self.features])
    return new_df
@ -136,18 +257,40 @@ class RobustScale(MLProcess):
@register_tool(tool_type=TOOL_TYPE)
class OrdinalEncode(MLProcess):
def __init__(
self,
features: list,
):
"""
Encode categorical features as ordinal integers.
"""
def __init__(self, features: list):
"""
Initialize the OrdinalEncode instance with feature names.
Args:
features (list): List of categorical feature names to be encoded.
"""
self.features = features
self.oe = None
def fit(self, df: pd.DataFrame):
"""
Learn the ordinal encodings for the features.
Args:
df (pd.DataFrame): Dataframe containing the categorical features.
"""
self.oe = OrdinalEncoder()
self.oe.fit(df[self.features])
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Convert the categorical features to ordinal integers.

    Args:
        df (pd.DataFrame): Dataframe containing the categorical features to be encoded.

    Returns:
        pd.DataFrame: A new dataframe with the encoded features.
    """
    # Work on a copy so the frame passed in by the caller is not modified.
    new_df = df.copy()
    new_df[self.features] = self.oe.transform(new_df[self.features])
    return new_df
@ -155,18 +298,40 @@ class OrdinalEncode(MLProcess):
@register_tool(tool_type=TOOL_TYPE)
class OneHotEncode(MLProcess):
def __init__(
self,
features: list,
):
"""
Apply one-hot encoding to specified categorical columns, the original columns will be dropped.
"""
def __init__(self, features: list):
"""
Initialize self.
Args:
features (list): Categorical columns to be one-hot encoded and dropped.
"""
self.features = features
self.ohe = None
def fit(self, df: pd.DataFrame):
    """
    Fit the OneHotEncoding model.

    Args:
        df (pd.DataFrame): The input DataFrame.
    """
    try:
        # scikit-learn >= 1.2 names the dense-output switch `sparse_output`;
        # the legacy `sparse` keyword was deprecated there and later removed.
        self.ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        # Older scikit-learn releases only accept the legacy `sparse` keyword.
        self.ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)
    self.ohe.fit(df[self.features])
def transform(self, df: pd.DataFrame):
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Transform the input DataFrame with the fitted model.
Args:
df (pd.DataFrame): The input DataFrame.
Returns:
pd.DataFrame: The transformed DataFrame.
"""
ts_data = self.ohe.transform(df[self.features])
new_columns = self.ohe.get_feature_names_out(self.features)
ts_data = pd.DataFrame(ts_data, columns=new_columns, index=df.index)
@ -177,21 +342,43 @@ class OneHotEncode(MLProcess):
@register_tool(tool_type=TOOL_TYPE)
class LabelEncode(MLProcess):
def __init__(
self,
features: list,
):
"""
Apply label encoding to specified categorical columns in-place.
"""
def __init__(self, features: list):
"""
Initialize self.
Args:
features (list): Categorical columns to be label encoded.
"""
self.features = features
self.le_encoders = []
def fit(self, df: pd.DataFrame):
    """
    Fit the LabelEncode model.

    Args:
        df (pd.DataFrame): The input DataFrame.
    """
    if len(self.features) == 0:
        return
    # Build a fresh list and swap it in at the end, so calling fit() again
    # replaces the earlier encoders instead of appending after them (the list
    # must stay aligned one-to-one with self.features).
    encoders = []
    for col in self.features:
        # Every encoder also learns an extra "unknown" class besides the
        # column's own distinct (string-cast) values.
        le = LabelEncoder().fit(df[col].astype(str).unique().tolist() + ["unknown"])
        encoders.append(le)
    self.le_encoders = encoders
def transform(self, df: pd.DataFrame):
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Transform the input DataFrame with the fitted model.
Args:
df (pd.DataFrame): The input DataFrame.
Returns:
pd.DataFrame: The transformed DataFrame.
"""
if len(self.features) == 0:
return df
new_df = df.copy()
@ -204,8 +391,17 @@ class LabelEncode(MLProcess):
return new_df
@register_tool(tool_type=TOOL_TYPE)
def get_column_info(df: pd.DataFrame) -> dict:
"""
Analyzes a DataFrame and categorizes its columns based on data types.
Args:
df (pd.DataFrame): The DataFrame to be analyzed.
Returns:
dict: A dictionary with four keys ('Category', 'Numeric', 'Datetime', 'Others').
Each key corresponds to a list of column names belonging to that category.
"""
column_info = {
"Category": [],
"Numeric": [],

View file

@ -184,7 +184,7 @@ class SplitBins(MLProcess):
return new_df
@register_tool(tool_type=TOOL_TYPE)
# @register_tool(tool_type=TOOL_TYPE)
class ExtractTimeComps(MLProcess):
def __init__(self, time_col: str, time_comps: list):
self.time_col = time_col
@ -242,6 +242,7 @@ class GeneralSelection(MLProcess):
# skip for now because lgb is needed
# @register_tool(tool_type=TOOL_TYPE)
class TreeBasedSelection(MLProcess):
def __init__(self, label_col: str, task_type: str):
self.label_col = label_col

View file

@ -0,0 +1,46 @@
OrdinalEncode:
type: class
description: Encode categorical features as ordinal integers.
methods:
__init__:
description: 'Initialize the OrdinalEncode instance with feature names. '
parameters:
properties:
features:
type: list
description: List of categorical feature names to be encoded.
required:
- features
fit:
description: 'Learn the ordinal encodings for the features. '
parameters:
properties:
df:
type: pd.DataFrame
description: Dataframe containing the categorical features.
required:
- df
fit_transform:
description: 'Fit and transform the input DataFrame. '
parameters:
properties:
df:
type: pd.DataFrame
description: The input DataFrame.
required:
- df
returns:
- type: pd.DataFrame
description: The transformed DataFrame.
transform:
description: 'Convert the categorical features to ordinal integers. '
parameters:
properties:
df:
type: pd.DataFrame
description: Dataframe containing the categorical features to be encoded.
required:
- df
returns:
- type: pd.DataFrame
description: A new dataframe with the encoded features.

View file

@ -0,0 +1,47 @@
RobustScale:
type: class
description: Apply the RobustScaler to scale features using statistics that are
robust to outliers.
methods:
__init__:
description: 'Initialize the RobustScale instance with feature names. '
parameters:
properties:
features:
type: list
description: List of feature names to be scaled.
required:
- features
fit:
description: 'Compute the median and IQR for scaling. '
parameters:
properties:
df:
type: pd.DataFrame
description: Dataframe containing the features.
required:
- df
fit_transform:
description: 'Fit and transform the input DataFrame. '
parameters:
properties:
df:
type: pd.DataFrame
description: The input DataFrame.
required:
- df
returns:
- type: pd.DataFrame
description: The transformed DataFrame.
transform:
description: 'Scale features using the previously computed median and IQR. '
parameters:
properties:
df:
type: pd.DataFrame
description: Dataframe containing the features to be scaled.
required:
- df
returns:
- type: pd.DataFrame
description: A new dataframe with scaled features.

View file

@ -0,0 +1,85 @@
import inspect
import re
def remove_spaces(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Whitespace at the start or end is collapsed as well but not removed, so a
    padded input keeps exactly one space on each padded side.
    """
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.sub(" ", text)
def convert_code_to_tool_schema(obj, include: list[str] = []):
    """Build a tool schema for a class or function from its docstrings.

    For a class, the schema holds the class description plus one entry per
    documented plain-function member (as returned by ``inspect.getmembers``
    filtered with ``inspect.isfunction``). For a function, the schema is the
    parsed docstring of that function.

    Args:
        obj: The class or function to describe.
        include (list[str], optional): Method names to keep when ``obj`` is a
            class; an empty list keeps every documented method. The default
            list is only read here, never modified.

    Returns:
        dict: A one-entry mapping ``{obj.__name__: schema}``.

    Raises:
        AssertionError: If ``obj`` has no docstring.
        TypeError: If ``obj`` is neither a class nor a function.
    """
    docstring = inspect.getdoc(obj)
    if not docstring:
        # Raised explicitly instead of via `assert` so the check still runs
        # under `python -O`; the exception type is unchanged for callers.
        raise AssertionError("no docstring found for the objects, skip registering")

    if inspect.isclass(obj):
        methods = {}
        for name, method in inspect.getmembers(obj, inspect.isfunction):
            if include and name not in include:
                continue
            method_doc = inspect.getdoc(method)
            if method_doc:
                methods[name] = docstring_to_schema(method_doc)
        schema = {"type": "class", "description": remove_spaces(docstring), "methods": methods}
    elif inspect.isfunction(obj):
        schema = {
            "type": "function",
            **docstring_to_schema(docstring),
        }
    else:
        # Previously this case fell through with `schema` unbound and failed
        # with an unrelated UnboundLocalError.
        raise TypeError(f"cannot build a tool schema for {obj!r}: expected a class or a function")

    return {obj.__name__: schema}
def docstring_to_schema(docstring: str):
    """Parse a Google-style docstring into a schema dictionary.

    The text before the first ``Args:``/``Returns:``/``Raises:`` header becomes
    the description, each ``name (type): text`` entry under ``Args:`` becomes a
    parameter, and a ``type: text`` entry under ``Returns:`` becomes the return
    entry.

    Args:
        docstring (str): The docstring to parse. ``None`` is accepted.

    Returns:
        dict: ``{}`` when ``docstring`` is ``None``; otherwise a dict with the
            keys ``description`` and ``parameters`` (holding ``properties`` and
            ``required``), plus ``returns`` when a return entry was found.
    """
    if docstring is None:
        return {}

    def _squash(text):
        # Collapse whitespace runs to one space AND trim both ends; collapsing
        # alone leaves a dangling space where the summary ended in newlines.
        return " ".join(text.split())

    # Summary: everything before the first section header.
    description_match = re.search(r"^(.*?)(?:Args:|Returns:|Raises:|$)", docstring, re.DOTALL)
    description = _squash(description_match.group(1)) if description_match else ""

    # Args section: everything between "Args:" and the next header.
    args_match = re.search(r"Args:\s*(.*?)(?:Returns:|Raises:|$)", docstring, re.DOTALL)
    _args = args_match.group(1).strip() if args_match else ""
    # One entry is "name (type): description". The lookahead ends a description
    # either right before the next "name (" entry on a new line or at the end
    # of the section (\Z), so descriptions may span several lines.
    variable_pattern = re.compile(r"(\w+)\s*\((.*?)\):\s*(.*?)(?=\n\s*\w+\s*\(|\Z)", re.DOTALL)

    parameter_schema = {"properties": {}, "required": []}
    for param_name, param_type, param_desc in variable_pattern.findall(_args):
        # A parameter is required unless its type is tagged "optional".
        if "optional" in param_type:
            param_type = param_type.replace(", optional", "")
        else:
            parameter_schema["required"].append(param_name)

        param_dict = {"type": param_type, "description": _squash(param_desc)}

        # "Defaults to X." -> default. The closing period must not be followed
        # by a digit, so a decimal such as 0.5 is not cut off at its point.
        default_val = re.search(r"Defaults to (.+?)\.(?!\d)", param_desc)
        if default_val:
            param_dict["default"] = default_val.group(1)

        # "Enum: [a, b]" -> list of the comma-separated items, as written.
        enum_val = re.search(r"Enum: \[(.+?)\]", param_desc)
        if enum_val:
            param_dict["enum"] = [e.strip() for e in enum_val.group(1).split(",")]

        parameter_schema["properties"][param_name] = param_dict

    # Returns section: everything between "Returns:" and "Raises:" / the end.
    returns_match = re.search(r"Returns:\s*(.*?)(?:Raises:|$)", docstring, re.DOTALL)
    returns_text = returns_match.group(1).strip() if returns_match else ""
    # Split on the FIRST colon of the first line; DOTALL lets the description
    # continue over following lines instead of making the whole match fail.
    return_entry = re.match(r"([^:\n]*?)[ \t]*:\s*(.*)", returns_text, re.DOTALL)

    # Assemble the schema dictionary (later dumped to YAML by the caller).
    schema = {
        "description": description,
        "parameters": parameter_schema,
    }
    if return_entry:
        schema["returns"] = [{"type": return_entry.group(1), "description": _squash(return_entry.group(2))}]

    return schema

View file

@ -14,6 +14,7 @@ import yaml
from metagpt.const import TOOL_SCHEMA_PATH
from metagpt.logs import logger
from metagpt.tools.tool_convert import convert_code_to_tool_schema
from metagpt.tools.tool_data_type import Tool, ToolSchema, ToolType
@ -34,7 +35,9 @@ class ToolRegistry:
schema_path=None,
tool_code="",
tool_type="other",
make_schema_if_not_exists=False,
tool_source_object=None,
include_functions=[],
make_schema_if_not_exists=True,
):
if self.has_tool(tool_name):
return
@ -44,14 +47,16 @@ class ToolRegistry:
if not os.path.exists(schema_path):
if make_schema_if_not_exists:
logger.warning(f"no schema found, will make schema at {schema_path}")
make_schema(tool_code, schema_path)
schema_dict = make_schema(tool_source_object, include_functions, schema_path)
else:
logger.warning(f"no schema found at assumed schema_path {schema_path}, skip registering {tool_name}")
return
with open(schema_path, "r", encoding="utf-8") as f:
schema_dict = yaml.safe_load(f)
schemas = schema_dict.get(tool_name) or list(schema_dict.values())[0]
else:
with open(schema_path, "r", encoding="utf-8") as f:
schema_dict = yaml.safe_load(f)
if not schema_dict:
return
schemas = schema_dict.get(tool_name) or list(schema_dict.values())[0]
schemas["tool_path"] = tool_path # corresponding code file path of the tool
try:
ToolSchema(**schemas) # validation
@ -94,7 +99,7 @@ def register_tool_type(cls):
return cls
def register_tool(tool_name="", tool_type="other", schema_path=None):
def register_tool(tool_name="", tool_type="other", schema_path=None, **kwargs):
"""register a tool to registry"""
def decorator(cls, tool_name=tool_name):
@ -112,15 +117,27 @@ def register_tool(tool_name="", tool_type="other", schema_path=None):
schema_path=schema_path,
tool_code=source_code,
tool_type=tool_type,
tool_source_object=cls,
**kwargs,
)
return cls
return decorator
def make_schema(tool_code, path):
def make_schema(tool_source_object, include, path):
os.makedirs(os.path.dirname(path), exist_ok=True) # Create the necessary directories
schema = {} # an empty schema for now
with open(path, "w", encoding="utf-8") as f:
yaml.dump(schema, f)
return path
try:
schema = convert_code_to_tool_schema(tool_source_object, include=include)
with open(path, "w", encoding="utf-8") as f:
yaml.dump(schema, f, sort_keys=False)
# import json
# with open(str(path).replace("yml", "json"), "w", encoding="utf-8") as f:
# json.dump(schema, f, ensure_ascii=False, indent=4)
logger.info(f"schema made at {path}")
except Exception as e:
schema = {}
logger.error("Fail to make schema")
print(e)
return schema