add tool registry

This commit is contained in:
yzlin 2024-01-13 01:28:49 +08:00
parent 224bf820b2
commit 46cd219e81
25 changed files with 1582 additions and 59 deletions

View file

@ -8,11 +8,9 @@ import re
from pathlib import Path
from typing import Dict, List, Tuple, Union
import yaml
from tenacity import retry, stop_after_attempt, wait_fixed
from metagpt.actions import Action
from metagpt.const import TOOL_SCHEMA_PATH
from metagpt.llm import LLM
from metagpt.logs import logger
from metagpt.prompts.ml_engineer import (
@ -24,12 +22,9 @@ from metagpt.prompts.ml_engineer import (
TOOL_USAGE_PROMPT,
)
from metagpt.schema import Message, Plan
from metagpt.tools import TOOL_TYPE_MAPPINGS
from metagpt.tools.tool_registry import TOOL_REGISTRY
from metagpt.utils.common import create_func_config, remove_comments
TOOL_TYPE_MODULE = {k: v.module for k, v in TOOL_TYPE_MAPPINGS.items()}
TOOL_TYPE_USAGE_PROMPT = {k: v.usage_prompt for k, v in TOOL_TYPE_MAPPINGS.items()}
class BaseWriteAnalysisCode(Action):
DEFAULT_SYSTEM_MSG: str = """You are Code Interpreter, a world-class programmer that can complete any goal by executing code. Strictly follow the plan and generate code step by step. Each step of the code will be executed on the user's machine, and the user will provide the code execution results to you.**Notice: The code for the next step depends on the code for the previous step. Must reuse variables in the lastest other code directly, dont creat it again, it is very import for you. Use !pip install in a standalone block to install missing packages.Usually the libraries you need are already installed.Dont check if packages already imported.**""" # prompt reference: https://github.com/KillianLucas/open-interpreter/blob/v0.1.4/interpreter/system_message.txt
@ -95,49 +90,27 @@ class WriteCodeByGenerate(BaseWriteAnalysisCode):
class WriteCodeWithTools(BaseWriteAnalysisCode):
"""Write code with help of local available tools. Choose tools first, then generate code to use the tools"""
schema_path: Union[Path, str] = TOOL_SCHEMA_PATH
available_tools: dict = {}
def __init__(self, **kwargs):
super().__init__(**kwargs)
self._load_tools(self.schema_path)
def _load_tools(self, schema_path, schema_module=None):
"""Load tools from yaml file"""
if isinstance(schema_path, dict):
schema_module = schema_module or "udf"
self.available_tools.update({schema_module: schema_path})
else:
if isinstance(schema_path, list):
yml_files = schema_path
elif isinstance(schema_path, Path) and schema_path.is_file():
yml_files = [schema_path]
else:
yml_files = schema_path.glob("*.yml")
for yml_file in yml_files:
module = yml_file.stem
with open(yml_file, "r", encoding="utf-8") as f:
self.available_tools[module] = yaml.safe_load(f)
def _parse_recommend_tools(self, module: str, recommend_tools: list) -> dict:
def _parse_recommend_tools(self, recommend_tools: list) -> dict:
"""
Parses and validates a list of recommended tools, and retrieves their schema from registry.
Args:
module (str): The module name for querying tools in the registry.
recommend_tools (list): A list of recommended tools.
Returns:
dict: A dict of valid tool schemas.
"""
valid_tools = []
available_tools = self.available_tools[module].keys()
for tool in recommend_tools:
if tool in available_tools:
valid_tools.append(tool)
for tool_name in recommend_tools:
if TOOL_REGISTRY.has_tool(tool_name):
valid_tools.append(TOOL_REGISTRY.get_tool(tool_name))
tool_catalog = {tool: self.available_tools[module][tool] for tool in valid_tools}
tool_catalog = {tool.name: tool.schema for tool in valid_tools}
return tool_catalog
async def _tool_recommendation(
@ -176,8 +149,10 @@ class WriteCodeWithTools(BaseWriteAnalysisCode):
tool_type = (
plan.current_task.task_type
) # find tool type from task type through exact match, can extend to retrieval in the future
available_tools = self.available_tools.get(tool_type, {})
special_prompt = TOOL_TYPE_USAGE_PROMPT.get(tool_type, "")
available_tools = TOOL_REGISTRY.get_tools_by_type(tool_type)
special_prompt = (
TOOL_REGISTRY.get_tool_type(tool_type).usage_prompt if TOOL_REGISTRY.has_tool_type(tool_type) else ""
)
code_steps = plan.current_task.code_steps
finished_tasks = plan.get_finished_tasks()
@ -185,22 +160,17 @@ class WriteCodeWithTools(BaseWriteAnalysisCode):
code_context = "\n\n".join(code_context)
tool_catalog = {}
module_name = ""
if len(available_tools) > 0:
available_tools = {k: v["description"] for k, v in available_tools.items()}
if available_tools:
available_tools = {tool_name: tool.schema["description"] for tool_name, tool in available_tools.items()}
recommend_tools = await self._tool_recommendation(
plan.current_task.instruction, code_steps, available_tools
)
tool_catalog = self._parse_recommend_tools(tool_type, recommend_tools)
tool_catalog = self._parse_recommend_tools(recommend_tools)
logger.info(f"Recommended tools: \n{recommend_tools}")
module_name = TOOL_TYPE_MODULE[tool_type]
tools_instruction = TOOL_USAGE_PROMPT.format(
special_prompt=special_prompt, module_name=module_name, tool_catalog=tool_catalog
)
tools_instruction = TOOL_USAGE_PROMPT.format(special_prompt=special_prompt, tool_catalog=tool_catalog)
context.append(Message(content=tools_instruction, role="user"))

View file

@ -12,7 +12,7 @@ from metagpt.actions import Action
from metagpt.logs import logger
from metagpt.prompts.ml_engineer import ASSIGN_TASK_TYPE_CONFIG, ASSIGN_TASK_TYPE_PROMPT
from metagpt.schema import Message, Plan, Task
from metagpt.tools import TOOL_TYPE_MAPPINGS
from metagpt.tools import TOOL_REGISTRY
from metagpt.utils.common import CodeParser, create_func_config
@ -47,13 +47,16 @@ class WritePlan(Action):
List[Dict]: tasks with task type assigned
"""
task_list = "\n".join([f"Task {task['task_id']}: {task['instruction']}" for task in tasks])
task_type_desc = "\n".join([f"- **{item.name}**: {item.desc}" for item in TOOL_TYPE_MAPPINGS.values()])
task_type_desc = "\n".join(
[f"- **{tool_type.name}**: {tool_type.desc}" for tool_type in TOOL_REGISTRY.get_tool_types().values()]
) # task types are bound to tool types for now; should be improved in the future
prompt = ASSIGN_TASK_TYPE_PROMPT.format(
task_list=task_list, task_type_desc=task_type_desc
) # task types are set to be the same as tool types, for now
tool_config = create_func_config(ASSIGN_TASK_TYPE_CONFIG)
rsp = await self.llm.aask_code(prompt, **tool_config)
task_type_list = rsp["task_type"]
print(f"assigned task types: {task_type_list}")
for task, task_type in zip(tasks, task_type_list):
task["task_type"] = task_type
return json.dumps(tasks)

View file

@ -203,7 +203,7 @@ Specifically, {special_prompt}
- You can freely combine the use of any other public packages, like sklearn, numpy, pandas, etc..
# Available Tools (can be empty):
Each Class tool is described in JSON format. When you call a tool, import the tool from `{module_name}` first.
Each Class tool is described in JSON format. When you call a tool, import the tool first.
{tool_catalog}
# Constraints:

View file

@ -8,17 +8,6 @@
from enum import Enum
from pydantic import BaseModel
from metagpt.const import TOOL_LIBS_PATH
from metagpt.prompts.tool_type import (
DATA_PREPROCESS_PROMPT,
FEATURE_ENGINEERING_PROMPT,
MODEL_TRAIN_PROMPT,
MODEL_EVALUATE_PROMPT,
VISION_PROMPT,
)
class SearchEngineType(Enum):
SERPAPI_GOOGLE = "serpapi"

View file

@ -14,8 +14,13 @@ from sklearn.preprocessing import (
)
from metagpt.tools.functions.libs.base import MLProcess
from metagpt.tools.tool_registry import register_tool
from metagpt.tools.tool_schema import ToolTypeEnum
TOOL_TYPE = ToolTypeEnum.DATA_PREPROCESS.value
@register_tool(tool_type_name=TOOL_TYPE)
class FillMissingValue(MLProcess):
def __init__(
self,
@ -42,6 +47,7 @@ class FillMissingValue(MLProcess):
return new_df
@register_tool(tool_type_name=TOOL_TYPE)
class MinMaxScale(MLProcess):
def __init__(
self,
@ -60,6 +66,7 @@ class MinMaxScale(MLProcess):
return new_df
@register_tool(tool_type_name=TOOL_TYPE)
class StandardScale(MLProcess):
def __init__(
self,
@ -78,6 +85,7 @@ class StandardScale(MLProcess):
return new_df
@register_tool(tool_type_name=TOOL_TYPE)
class MaxAbsScale(MLProcess):
def __init__(
self,
@ -96,6 +104,7 @@ class MaxAbsScale(MLProcess):
return new_df
@register_tool(tool_type_name=TOOL_TYPE)
class RobustScale(MLProcess):
def __init__(
self,
@ -114,6 +123,7 @@ class RobustScale(MLProcess):
return new_df
@register_tool(tool_type_name=TOOL_TYPE)
class OrdinalEncode(MLProcess):
def __init__(
self,
@ -132,6 +142,7 @@ class OrdinalEncode(MLProcess):
return new_df
@register_tool(tool_type_name=TOOL_TYPE)
class OneHotEncode(MLProcess):
def __init__(
self,
@ -153,6 +164,7 @@ class OneHotEncode(MLProcess):
return new_df
@register_tool(tool_type_name=TOOL_TYPE)
class LabelEncode(MLProcess):
def __init__(
self,
@ -181,6 +193,7 @@ class LabelEncode(MLProcess):
return new_df
@register_tool(tool_type_name=TOOL_TYPE)
def get_column_info(df: pd.DataFrame) -> dict:
column_info = {
"Category": [],

View file

@ -6,7 +6,7 @@
# @Desc : Feature Engineering Tools
import itertools
import lightgbm as lgb
# import lightgbm as lgb
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
@ -16,8 +16,13 @@ from sklearn.model_selection import KFold
from sklearn.preprocessing import KBinsDiscretizer, PolynomialFeatures
from metagpt.tools.functions.libs.base import MLProcess
from metagpt.tools.tool_registry import register_tool
from metagpt.tools.tool_schema import ToolTypeEnum
TOOL_TYPE = ToolTypeEnum.FEATURE_ENGINEERING.value
@register_tool(tool_type_name=TOOL_TYPE)
class PolynomialExpansion(MLProcess):
def __init__(self, cols: list, degree: int = 2, label_col: str = None):
self.cols = cols
@ -48,6 +53,7 @@ class PolynomialExpansion(MLProcess):
return new_df
@register_tool(tool_type_name=TOOL_TYPE)
class CatCount(MLProcess):
def __init__(self, col: str):
self.col = col
@ -62,6 +68,7 @@ class CatCount(MLProcess):
return new_df
@register_tool(tool_type_name=TOOL_TYPE)
class TargetMeanEncoder(MLProcess):
def __init__(self, col: str, label: str):
self.col = col
@ -77,6 +84,7 @@ class TargetMeanEncoder(MLProcess):
return new_df
@register_tool(tool_type_name=TOOL_TYPE)
class KFoldTargetMeanEncoder(MLProcess):
def __init__(self, col: str, label: str, n_splits: int = 5, random_state: int = 2021):
self.col = col
@ -103,6 +111,7 @@ class KFoldTargetMeanEncoder(MLProcess):
return new_df
@register_tool(tool_type_name=TOOL_TYPE)
class CatCross(MLProcess):
def __init__(self, cols: list, max_cat_num: int = 100):
self.cols = cols
@ -138,6 +147,7 @@ class CatCross(MLProcess):
return new_df
@register_tool(tool_type_name=TOOL_TYPE)
class GroupStat(MLProcess):
def __init__(self, group_col: str, agg_col: str, agg_funcs: list):
self.group_col = group_col
@ -157,6 +167,7 @@ class GroupStat(MLProcess):
return new_df
@register_tool(tool_type_name=TOOL_TYPE)
class SplitBins(MLProcess):
def __init__(self, cols: list, strategy: str = "quantile"):
self.cols = cols
@ -173,6 +184,7 @@ class SplitBins(MLProcess):
return new_df
@register_tool(tool_type_name=TOOL_TYPE)
class ExtractTimeComps(MLProcess):
def __init__(self, time_col: str, time_comps: list):
self.time_col = time_col
@ -201,6 +213,7 @@ class ExtractTimeComps(MLProcess):
return new_df
@register_tool(tool_type_name=TOOL_TYPE)
class GeneralSelection(MLProcess):
def __init__(self, label_col: str):
self.label_col = label_col
@ -228,6 +241,7 @@ class GeneralSelection(MLProcess):
return new_df
# skip for now because lgb is needed
class TreeBasedSelection(MLProcess):
def __init__(self, label_col: str, task_type: str):
self.label_col = label_col
@ -270,6 +284,7 @@ class TreeBasedSelection(MLProcess):
return new_df
@register_tool(tool_type_name=TOOL_TYPE)
class VarianceBasedSelection(MLProcess):
def __init__(self, label_col: str, threshold: float = 0):
self.label_col = label_col

View file

@ -0,0 +1,61 @@
FillMissingValue:
type: class
description: "Completing missing values with simple strategies"
methods:
__init__:
description: "Initialize self."
parameters:
properties:
features:
type: list
description: "columns to be processed"
strategy:
type: str
description: "the imputation strategy, notice mean/median can only be used for numeric features"
default: mean
enum:
- mean
- median
- most_frequent
- constant
fill_value:
type: int
description: "fill_value is used to replace all occurrences of missing_values"
default: null
required:
- features
fit:
description: "Fit the FillMissingValue model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."

View file

@ -0,0 +1,48 @@
LabelEncode:
type: class
description: "Apply label encoding to specified categorical columns in-place."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
features:
type: list
description: "Categorical columns to be label encoded"
required:
- features
fit:
description: "Fit the LabelEncode model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."

View file

@ -0,0 +1,48 @@
MaxAbsScale:
type: class
description: "Scale each feature by its maximum absolute value"
methods:
__init__:
description: "Initialize self."
parameters:
properties:
features:
type: list
description: "columns to be processed"
required:
- features
fit:
description: "Fit the MaxAbsScale model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."

View file

@ -0,0 +1,48 @@
MinMaxScale:
type: class
description: "Transform features by scaling each feature to a range, which is (0, 1)"
methods:
__init__:
description: "Initialize self."
parameters:
properties:
features:
type: list
description: "columns to be processed"
required:
- features
fit:
description: "Fit the MinMaxScale model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."

View file

@ -0,0 +1,48 @@
OneHotEncode:
type: class
description: "Apply one-hot encoding to specified categorical columns, the original columns will be dropped."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
features:
type: list
description: "Categorical columns to be one-hot encoded and dropped"
required:
- features
fit:
description: "Fit the OneHotEncode model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."

View file

@ -0,0 +1,48 @@
StandardScale:
type: class
description: "Standardize features by removing the mean and scaling to unit variance"
methods:
__init__:
description: "Initialize self."
parameters:
properties:
features:
type: list
description: "columns to be processed"
required:
- features
fit:
description: "Fit the StandardScale model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."

View file

@ -0,0 +1,48 @@
CatCount:
type: class
description: "Add value counts of a categorical column as new feature."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
col:
type: str
description: "Column for value counts."
required:
- col
fit:
description: "Fit the CatCount model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."

View file

@ -0,0 +1,52 @@
CatCross:
type: class
description: "Add pairwise crossed features and convert them to numerical features."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
cols:
type: list
description: "Columns to be pairwise crossed, at least 2 columns."
max_cat_num:
type: int
description: "Maximum unique categories per crossed feature."
default: 100
required:
- cols
fit:
description: "Fit the CatCross model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."

View file

@ -0,0 +1,48 @@
GeneralSelection:
type: class
description: "Drop all nan feats and feats with only one unique value."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
label_col:
type: str
description: "Label column name."
required:
- label_col
fit:
description: "Fit the GeneralSelection model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."

View file

@ -0,0 +1,58 @@
GroupStat:
type: class
description: "Aggregate specified column in a DataFrame grouped by another column, adding new features named '<agg_col>_<agg_func>_by_<group_col>'."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
group_col:
type: str
description: "Column used for grouping."
agg_col:
type: str
description: "Column on which aggregation is performed."
agg_funcs:
type: list
description: >-
List of aggregation functions to apply, such as ['mean', 'std'].
Each function must be supported by pandas.
required:
- group_col
- agg_col
- agg_funcs
fit:
description: "Fit the GroupStat model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."

View file

@ -0,0 +1,60 @@
KFoldTargetMeanEncoder:
type: class
description: "Adds a new feature to the DataFrame by k-fold mean encoding of a categorical column using the label column."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
col:
type: str
description: "Column to be k-fold mean encoded."
label:
type: str
description: "Predicted label column."
n_splits:
type: int
description: "Number of splits for K-fold."
default: 5
random_state:
type: int
description: "Random seed."
default: 2021
required:
- col
- label
fit:
description: "Fit the KFoldTargetMeanEncoder model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."

View file

@ -0,0 +1,548 @@
PolynomialExpansion:
type: class
description: "Add polynomial and interaction features from selected numeric columns to input DataFrame."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
cols:
type: list
description: "Columns for polynomial expansion."
label_col:
type: str
description: "Label column name."
degree:
type: int
description: "The degree of the polynomial features."
default: 2
required:
- cols
- label_col
fit:
description: "Fit the PolynomialExpansion model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame without duplicated columns."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame without duplicated columns."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
CatCount:
type: class
description: "Add value counts of a categorical column as new feature."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
col:
type: str
description: "Column for value counts."
required:
- col
fit:
description: "Fit the CatCount model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
TargetMeanEncoder:
type: class
description: "Encodes a categorical column by the mean of the label column, and adds the result as a new feature."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
col:
type: str
description: "Column to be mean encoded."
label:
type: str
description: "Predicted label column."
required:
- col
- label
fit:
description: "Fit the TargetMeanEncoder model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
KFoldTargetMeanEncoder:
type: class
description: "Adds a new feature to the DataFrame by k-fold mean encoding of a categorical column using the label column."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
col:
type: str
description: "Column to be k-fold mean encoded."
label:
type: str
description: "Predicted label column."
n_splits:
type: int
description: "Number of splits for K-fold."
default: 5
random_state:
type: int
description: "Random seed."
default: 2021
required:
- col
- label
fit:
description: "Fit the KFoldTargetMeanEncoder model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
CatCross:
type: class
description: "Add pairwise crossed features and convert them to numerical features."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
cols:
type: list
description: "Columns to be pairwise crossed, at least 2 columns."
max_cat_num:
type: int
description: "Maximum unique categories per crossed feature."
default: 100
required:
- cols
fit:
description: "Fit the CatCross model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
GroupStat:
type: class
description: "Aggregate specified column in a DataFrame grouped by another column, adding new features named '<agg_col>_<agg_func>_by_<group_col>'."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
group_col:
type: str
description: "Column used for grouping."
agg_col:
type: str
description: "Column on which aggregation is performed."
agg_funcs:
type: list
description: >-
List of aggregation functions to apply, such as ['mean', 'std'].
Each function must be supported by pandas.
required:
- group_col
- agg_col
- agg_funcs
fit:
description: "Fit the GroupStat model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
SplitBins:
type: class
description: "Inplace binning of continuous data into intervals, returning integer-encoded bin identifiers directly."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
cols:
type: list
description: "Columns to be binned inplace."
strategy:
type: str
description: "Strategy used to define the widths of the bins."
default: quantile
enum:
- quantile
- uniform
- kmeans
required:
- cols
fit:
description: "Fit the SplitBins model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
GeneralSelection:
type: class
description: "Drop all nan feats and feats with only one unique value."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
label_col:
type: str
description: "Label column name."
required:
- label_col
fit:
description: "Fit the GeneralSelection model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame."
TreeBasedSelection:
type: class
description: "Select features based on tree-based model and remove features with low importance."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
label_col:
type: str
description: "Label column name."
task_type:
type: str
description: "Task type, 'cls' for classification, 'mcls' for multi-class classification, 'reg' for regression."
enum:
- cls
- mcls
- reg
required:
- label_col
- task_type
fit:
description: "Fit the TreeBasedSelection model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame, which contains label_col."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame, which contains label_col."
VarianceBasedSelection:
type: class
description: "Select features based on variance and remove features with low variance."
methods:
__init__:
description: "Initialize self."
parameters:
properties:
label_col:
type: str
description: "Label column name."
threshold:
type: float
description: "Threshold for variance."
default: 0.0
required:
- label_col
fit:
description: "Fit the VarianceBasedSelection model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
transform:
description: "Transform the input DataFrame with the fitted model."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame, which contains label_col."
fit_transform:
description: "Fit and transform the input DataFrame."
parameters:
properties:
df:
type: DataFrame
description: "The input DataFrame."
required:
- df
returns:
df:
type: DataFrame
description: "The transformed DataFrame, which contains label_col."

View file

@ -0,0 +1,56 @@
# Tool schema for the SplitBins class.
# Documents the constructor plus the fit / transform / fit_transform methods.
SplitBins:
  type: class
  description: "Inplace binning of continuous data into intervals, returning integer-encoded bin identifiers directly."
  methods:
    # Constructor: `cols` is required; `strategy` is optional and limited to the enum values below.
    __init__:
      description: "Initialize self."
      parameters:
        properties:
          cols:
            type: list
            description: "Columns to be binned inplace."
          strategy:
            type: str
            description: "Strategy used to define the widths of the bins."
            default: quantile
            enum:
              - quantile
              - uniform
              - kmeans
        required:
          - cols
    # fit declares no `returns` section.
    fit:
      description: "Fit the SplitBins model."
      parameters:
        properties:
          df:
            type: DataFrame
            description: "The input DataFrame."
        required:
          - df
    transform:
      description: "Transform the input DataFrame with the fitted model."
      parameters:
        properties:
          df:
            type: DataFrame
            description: "The input DataFrame."
        required:
          - df
      returns:
        df:
          type: DataFrame
          description: "The transformed DataFrame."
    fit_transform:
      description: "Fit and transform the input DataFrame."
      parameters:
        properties:
          df:
            type: DataFrame
            description: "The input DataFrame."
        required:
          - df
      returns:
        df:
          type: DataFrame
          description: "The transformed DataFrame."

View file

@ -0,0 +1,52 @@
# Tool schema for the TargetMeanEncoder class.
# Documents the constructor plus the fit / transform / fit_transform methods.
TargetMeanEncoder:
  type: class
  description: "Encodes a categorical column by the mean of the label column, and adds the result as a new feature."
  methods:
    # Constructor: both `col` and `label` are required.
    __init__:
      description: "Initialize self."
      parameters:
        properties:
          col:
            type: str
            description: "Column to be mean encoded."
          label:
            type: str
            description: "Predicted label column."
        required:
          - col
          - label
    # fit declares no `returns` section.
    fit:
      description: "Fit the TargetMeanEncoder model."
      parameters:
        properties:
          df:
            type: DataFrame
            description: "The input DataFrame."
        required:
          - df
    transform:
      description: "Transform the input DataFrame with the fitted model."
      parameters:
        properties:
          df:
            type: DataFrame
            description: "The input DataFrame."
        required:
          - df
      returns:
        df:
          type: DataFrame
          description: "The transformed DataFrame."
    fit_transform:
      description: "Fit and transform the input DataFrame."
      parameters:
        properties:
          df:
            type: DataFrame
            description: "The input DataFrame."
        required:
          - df
      returns:
        df:
          type: DataFrame
          description: "The transformed DataFrame."

View file

@ -0,0 +1,56 @@
# Tool schema for the TreeBasedSelection class.
# Documents the constructor plus the fit / transform / fit_transform methods.
TreeBasedSelection:
  type: class
  description: "Select features based on tree-based model and remove features with low importance."
  methods:
    # Constructor: both `label_col` and `task_type` are required; `task_type` is limited to the enum values below.
    __init__:
      description: "Initialize self."
      parameters:
        properties:
          label_col:
            type: str
            description: "Label column name."
          task_type:
            type: str
            description: "Task type, 'cls' for classification, 'mcls' for multi-class classification, 'reg' for regression."
            enum:
              - cls
              - mcls
              - reg
        required:
          - label_col
          - task_type
    # fit declares no `returns` section.
    fit:
      description: "Fit the TreeBasedSelection model."
      parameters:
        properties:
          df:
            type: DataFrame
            description: "The input DataFrame."
        required:
          - df
    transform:
      description: "Transform the input DataFrame with the fitted model."
      parameters:
        properties:
          df:
            type: DataFrame
            description: "The input DataFrame."
        required:
          - df
      returns:
        df:
          type: DataFrame
          description: "The transformed DataFrame contain label_col."
    fit_transform:
      description: "Fit and transform the input DataFrame."
      parameters:
        properties:
          df:
            type: DataFrame
            description: "The input DataFrame."
        required:
          - df
      returns:
        df:
          type: DataFrame
          description: "The transformed DataFrame contain label_col."

View file

@ -0,0 +1,52 @@
# Tool schema for the VarianceBasedSelection class.
# Documents the constructor plus the fit / transform / fit_transform methods.
VarianceBasedSelection:
  type: class
  description: "Select features based on variance and remove features with low variance."
  methods:
    # Constructor: `label_col` is required, `threshold` is optional (default 0.0).
    __init__:
      description: "Initialize self."
      parameters:
        properties:
          label_col:
            type: str
            description: "Label column name."
          threshold:
            type: float
            description: "Threshold for variance."
            default: 0.0
        required:
          - label_col
    # fit declares no `returns` section.
    fit:
      description: "Fit the VarianceBasedSelection model."
      parameters:
        properties:
          df:
            type: DataFrame
            description: "The input DataFrame."
        required:
          - df
    transform:
      description: "Transform the input DataFrame with the fitted model."
      parameters:
        properties:
          df:
            type: DataFrame
            description: "The input DataFrame."
        required:
          - df
      returns:
        df:
          type: DataFrame
          description: "The transformed DataFrame contain label_col."
    fit_transform:
      description: "Fit and transform the input DataFrame."
      parameters:
        properties:
          df:
            type: DataFrame
            description: "The input DataFrame."
        required:
          - df
      returns:
        df:
          type: DataFrame
          description: "The transformed DataFrame contain label_col."

View file

@ -0,0 +1,128 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Time : 2023/01/12 17:07
@Author : garylin2099
@File : tool_registry.py
"""
import os
from collections import defaultdict
import inspect
import re
import yaml
from metagpt.tools.tool_schema import ToolType, ToolSchema, Tool
from metagpt.logs import logger
from metagpt.const import TOOL_SCHEMA_PATH
class ToolRegistry:
    """In-memory registry of tools and tool types.

    Attributes:
        tools: Maps tool name to its ``Tool`` record.
        tool_types: Maps tool type name to its ``ToolType`` instance.
        tools_by_types: Two-layer mapping,
            ``{tool_type_name: {tool_name: Tool, ...}, ...}``.
    """

    def __init__(self):
        self.tools = {}
        self.tool_types = {}
        # two-layer k-v, {tool_type_name: {tool_name: {...}, ...}, ...}
        self.tools_by_types = defaultdict(dict)

    def register_tool_type(self, tool_type: "ToolType"):
        """Store ``tool_type`` under ``tool_type.name``, replacing any previous entry."""
        self.tool_types[tool_type.name] = tool_type

    def register_tool(
        self,
        tool_name,
        tool_path,
        schema_path=None,
        tool_code="",
        tool_type_name="other",
        make_schema_if_not_exists=False,
    ):
        """Register a tool together with the schema loaded from its YAML file.

        Does nothing if ``tool_name`` is already registered. If the schema file
        does not exist, the tool is skipped unless ``make_schema_if_not_exists``
        is true, in which case a placeholder schema file is created first.

        Args:
            tool_name: Registry key; also the top-level key looked up in the schema file.
            tool_path: Path of the code file that defines the tool.
            schema_path: Schema file location. Defaults to
                ``TOOL_SCHEMA_PATH / tool_type_name / f"{tool_name}.yml"``.
            tool_code: Source code of the tool.
            tool_type_name: Name of the tool type the tool is grouped under.
            make_schema_if_not_exists: Create a placeholder schema file when none is found.
        """
        if self.has_tool(tool_name):
            return

        schema_path = schema_path or TOOL_SCHEMA_PATH / tool_type_name / f"{tool_name}.yml"

        if not os.path.exists(schema_path):
            if not make_schema_if_not_exists:
                logger.warning(f"no schema found at assumed schema_path {schema_path}, skip registering {tool_name}")
                return
            logger.warning(f"no schema found, will make schema at {schema_path}")
            make_schema(tool_code, schema_path)

        with open(schema_path, "r", encoding="utf-8") as f:
            loaded = yaml.safe_load(f)

        # A freshly made placeholder file (or an empty / unrelated one) has no entry
        # for this tool; fall back to an empty schema instead of raising KeyError.
        entry = loaded.get(tool_name) if isinstance(loaded, dict) else None
        if isinstance(entry, dict):
            schema = entry
        else:
            logger.warning(f"no schema entry for {tool_name} in {schema_path}, registering with an empty schema")
            schema = {}
        schema["tool_path"] = tool_path  # corresponding code file path of the tool

        # Validation is best-effort: a non-conforming schema is still used.
        try:
            ToolSchema(**schema)
        except Exception as e:
            logger.debug(
                f"{tool_name} schema does not conform to the required format, but will be used anyway. Mismatch: {e}"
            )

        tool = Tool(name=tool_name, path=tool_path, schema=schema, code=tool_code)
        self.tools[tool_name] = tool
        self.tools_by_types[tool_type_name][tool_name] = tool
        logger.info(f"{tool_name} registered")

    def has_tool(self, key):
        """Return whether a tool named ``key`` is registered."""
        return key in self.tools

    def get_tool(self, key):
        """Return the tool named ``key``, or ``None`` if it is not registered."""
        return self.tools.get(key)

    def get_tools_by_type(self, key):
        """Return the ``{tool_name: Tool}`` mapping for tool type ``key``, or ``None`` if there is none."""
        return self.tools_by_types.get(key)

    def has_tool_type(self, key):
        """Return whether a tool type named ``key`` is registered."""
        return key in self.tool_types

    def get_tool_type(self, key):
        """Return the tool type named ``key``, or ``None`` if it is not registered."""
        return self.tool_types.get(key)

    def get_tool_types(self):
        """Return the mapping of all registered tool types, keyed by name."""
        return self.tool_types
# Module-level registry instance; the register_tool_type / register_tool decorators in this module write to it.
TOOL_REGISTRY = ToolRegistry()
def register_tool_type(cls):
    """Class decorator that adds an instance of ``cls`` to ``TOOL_REGISTRY``.

    ``cls`` is instantiated with no arguments and the instance is registered
    as a tool type. The class itself is returned unchanged.
    """
    instance = cls()
    TOOL_REGISTRY.register_tool_type(tool_type=instance)
    return cls
def register_tool(tool_name="", tool_type_name="other", schema_path=None):
    """Build a decorator that records the decorated function / class in ``TOOL_REGISTRY``.

    Args:
        tool_name: Name to register under; falls back to the decorated object's ``__name__``.
        tool_type_name: Name of the tool type the tool is grouped under.
        schema_path: Optional schema file path, passed through to the registry.
    """

    def decorator(cls, tool_name=tool_name):
        # The outer tool_name is bound as a default so it can be reassigned locally.
        resolved_name = tool_name if tool_name else cls.__name__

        # Locate the defining file; keep only the part starting at "metagpt" when present.
        defined_in = inspect.getfile(cls)
        if "metagpt" in defined_in:
            defined_in = re.search("metagpt.+", defined_in).group(0)

        TOOL_REGISTRY.register_tool(
            tool_name=resolved_name,
            tool_path=defined_in,
            schema_path=schema_path,
            tool_code=inspect.getsource(cls),
            tool_type_name=tool_type_name,
        )
        return cls

    return decorator
def make_schema(tool_code, path):
    """Write a placeholder (empty) schema file at ``path`` and return ``path``.

    Missing parent directories are created first. ``tool_code`` is accepted
    but not used yet, since the generated schema is empty for now.

    Args:
        tool_code: Source code of the tool (currently unused).
        path: Destination of the schema file.

    Returns:
        The ``path`` that was written.
    """
    parent_dir = os.path.dirname(path)
    # A bare filename has an empty dirname, and os.makedirs("") raises FileNotFoundError.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)  # Create the necessary directories
    schema = {}  # an empty schema for now
    with open(path, "w", encoding="utf-8") as f:
        yaml.dump(schema, f)
    return path

View file

@ -0,0 +1,31 @@
from enum import Enum
from pydantic import BaseModel
class ToolTypeEnum(Enum):
    """Names of the known tool categories.

    Looking up a value that matches no member (``ToolTypeEnum("something")``)
    yields ``OTHER`` instead of raising ``ValueError``.
    """

    DATA_PREPROCESS = "data_preprocess"
    FEATURE_ENGINEERING = "feature_engineering"
    MODEL_TRAIN = "model_train"
    MODEL_EVALUATE = "model_evaluate"
    OTHER = "other"

    @classmethod
    def _missing_(cls, value):
        # Enum calls this hook when a by-value lookup finds no member.
        return cls.OTHER

    def __missing__(self, key):
        # Enum never calls this dict-style hook; kept so existing direct calls keep working.
        return self.OTHER
# Describes one tool category: its name, a short description and an optional usage prompt.
class ToolType(BaseModel):
    name: str
    desc: str
    usage_prompt: str = ""  # empty string when the type has no usage prompt
# Model for a tool schema; the only field declared so far is `name`.
class ToolSchema(BaseModel):
    name: str
# Record for one tool: its name, path, schema dict and source code.
class Tool(BaseModel):
    name: str
    path: str
    # NOTE(review): `schema` is also the name of a BaseModel method — confirm the shadowing is intended.
    schema: dict = {}
    code: str = ""

View file

@ -0,0 +1,43 @@
from metagpt.prompts.tool_type import (
DATA_PREPROCESS_PROMPT,
FEATURE_ENGINEERING_PROMPT,
MODEL_TRAIN_PROMPT,
MODEL_EVALUATE_PROMPT,
)
from metagpt.tools.tool_schema import ToolTypeEnum, ToolType
from metagpt.tools.tool_registry import register_tool_type
# Tool type named by ToolTypeEnum.DATA_PREPROCESS; the decorator registers it as a tool type.
@register_tool_type
class DataPreprocess(ToolType):
    name: str = ToolTypeEnum.DATA_PREPROCESS.value
    desc: str = "Only for changing value inplace."
    usage_prompt: str = DATA_PREPROCESS_PROMPT
# Tool type named by ToolTypeEnum.FEATURE_ENGINEERING; the decorator registers it as a tool type.
@register_tool_type
class FeatureEngineer(ToolType):
    name: str = ToolTypeEnum.FEATURE_ENGINEERING.value
    desc: str = "Only for creating new columns for input data."
    usage_prompt: str = FEATURE_ENGINEERING_PROMPT
# Tool type named by ToolTypeEnum.MODEL_TRAIN; the decorator registers it as a tool type.
@register_tool_type
class ModelTrain(ToolType):
    name: str = ToolTypeEnum.MODEL_TRAIN.value
    desc: str = "Only for training model."
    usage_prompt: str = MODEL_TRAIN_PROMPT
# Tool type named by ToolTypeEnum.MODEL_EVALUATE; the decorator registers it as a tool type.
@register_tool_type
class ModelEvaluate(ToolType):
    name: str = ToolTypeEnum.MODEL_EVALUATE.value
    desc: str = "Only for evaluating model."
    usage_prompt: str = MODEL_EVALUATE_PROMPT
# Catch-all tool type named by ToolTypeEnum.OTHER, with no usage prompt; the decorator registers it as a tool type.
@register_tool_type
class Other(ToolType):
    name: str = ToolTypeEnum.OTHER.value
    desc: str = "Any tools not in the defined categories"
    usage_prompt: str = ""