Merge branch 'simplify_tools' into 'code_intepreter'

reduce ml libs redundancy

See merge request agents/data_agents_opt!76
This commit is contained in:
林义章 2024-02-04 15:30:04 +00:00
commit 9af2508654
27 changed files with 206 additions and 585 deletions

View file

@ -5,7 +5,7 @@
@File : crawl_webpage.py
"""
from metagpt.roles.code_interpreter import CodeInterpreter
from metagpt.roles.ci.code_interpreter import CodeInterpreter
async def main():

View file

@ -5,7 +5,7 @@
@Author : mannaandpoem
@File : imitate_webpage.py
"""
from metagpt.roles.code_interpreter import CodeInterpreter
from metagpt.roles.ci.code_interpreter import CodeInterpreter
async def main():

View file

@ -4,7 +4,7 @@
# @Desc :
import asyncio
from metagpt.roles.code_interpreter import CodeInterpreter
from metagpt.roles.ci.code_interpreter import CodeInterpreter
async def main(requirement: str = ""):

View file

@ -25,7 +25,6 @@ class WriteCodeWithToolsML(WriteCodeWithTools):
tool_schemas, tool_type_usage_prompt = await self._prepare_tools(plan=plan)
# ML-specific variables to be used in prompt
code_steps = plan.current_task.code_steps
finished_tasks = plan.get_finished_tasks()
code_context = [remove_comments(task.code) for task in finished_tasks]
code_context = "\n\n".join(code_context)
@ -38,7 +37,6 @@ class WriteCodeWithToolsML(WriteCodeWithTools):
current_task=plan.current_task.instruction,
column_info=column_info,
tool_type_usage_prompt=tool_type_usage_prompt,
code_steps=code_steps,
tool_schemas=tool_schemas,
)
@ -49,7 +47,6 @@ class WriteCodeWithToolsML(WriteCodeWithTools):
current_task=plan.current_task.instruction,
column_info=column_info,
tool_type_usage_prompt=tool_type_usage_prompt,
code_steps=code_steps,
)
tool_config = create_func_call_config(CODE_GENERATOR_WITH_TOOLS)
rsp = await self.llm.aask_code(prompt, **tool_config)

View file

@ -79,7 +79,6 @@ class WriteCodeWithTools(BaseWriteAnalysisCode):
async def _recommend_tool(
self,
task: str,
code_steps: str,
available_tools: dict,
) -> list:
"""
@ -87,7 +86,6 @@ class WriteCodeWithTools(BaseWriteAnalysisCode):
Args:
task (str): the task to recommend tools for
code_steps (str): the code steps to generate the full code for the task
available_tools (dict): the available tools description
Returns:
@ -95,7 +93,6 @@ class WriteCodeWithTools(BaseWriteAnalysisCode):
"""
prompt = TOOL_RECOMMENDATION_PROMPT.format(
current_task=task,
code_steps=code_steps,
available_tools=available_tools,
)
tool_config = create_func_call_config(SELECT_FUNCTION_TOOLS)
@ -132,8 +129,7 @@ class WriteCodeWithTools(BaseWriteAnalysisCode):
available_tools = self._get_tools_by_type(tool_type)
if available_tools:
available_tools = {tool_name: tool.schemas["description"] for tool_name, tool in available_tools.items()}
code_steps = plan.current_task.code_steps
tool_schemas = await self._recommend_tool(plan.current_task.instruction, code_steps, available_tools)
tool_schemas = await self._recommend_tool(plan.current_task.instruction, available_tools)
return tool_schemas, tool_type_usage_prompt

View file

@ -84,15 +84,11 @@ Latest data info after previous tasks:
Write complete code for 'Current Task'. And avoid duplicating code from 'Done Tasks', such as repeated import of packages, reading data, etc.
Specifically, {tool_type_usage_prompt}
# Code Steps:
Strictly follow steps below when you writing code if it's convenient.
{code_steps}
# Output Example:
when current task is "train a lightgbm model on training data", and their are two steps in 'Code Steps', the code be like:
when current task is "train a lightgbm model on training data", the code can be like:
```python
# Step 1: check data type and convert to numeric
ojb_cols = train.select_dtypes(include='object').columns.tolist()
obj_cols = train.select_dtypes(include='object').columns.tolist()
for col in obj_cols:
encoder = LabelEncoder()
@ -107,7 +103,6 @@ model.fit(train, y_train)
# Constraints:
- Ensure the output new code is executable in the same Jupyter notebook with previous tasks code have been executed.
- The output code should contain all steps implemented in 'Code Steps'.
"""
ML_TOOL_USAGE_PROMPT = """
@ -130,10 +125,6 @@ Latest data info after previous tasks:
Write complete code for 'Current Task'. And avoid duplicating code from 'Done Tasks', such as repeated import of packages, reading data, etc.
Specifically, {tool_type_usage_prompt}
# Code Steps:
Strictly follow steps below when you writing code if it's convenient.
{code_steps}
# Capabilities
- You can utilize pre-defined tools in any code lines from 'Available Tools' in the form of Python Class.
- You can freely combine the use of any other public packages, like sklearn, numpy, pandas, etc..
@ -143,7 +134,7 @@ Each Class tool is described in JSON format. When you call a tool, import the to
{tool_schemas}
# Output Example:
when current task is "do data preprocess, like fill missing value, handle outliers, etc.", and their are two steps in 'Code Steps', the code be like:
when current task is "do data preprocess, like fill missing value, handle outliers, etc.", the code can be like:
```python
# Step 1: fill missing value
# Tools used: ['FillMissingValue']
@ -170,6 +161,4 @@ for col in num_cols:
- Ensure the output new code is executable in the same Jupyter notebook with previous tasks code have been executed.
- Always prioritize using pre-defined tools for the same functionality.
- Always copy the DataFrame before processing it and use the copy to process.
- The output code should contain all steps implemented correctly in 'Code Steps'.
"""
# - If 'Code Steps' contains step done in 'Done Tasks', such as reading data, don't repeat it.

View file

@ -30,8 +30,6 @@ TOOL_RECOMMENDATION_PROMPT = """
## Task
Recommend up to five tools from 'Available Tools' that can help solve the 'User Requirement'.
This is a detailed code steps for current task. You can refer to it when recommending tools.
{code_steps}
## Available Tools:
{available_tools}

View file

@ -14,10 +14,10 @@ The current task is about data preprocessing, please note the following:
FEATURE_ENGINEERING_PROMPT = """
The current task is about feature engineering. when performing it, please adhere to the following principles:
- Generate as diverse features as possible to improve the model's performance step-by-step.
- If potential impactful features are not included in 'Code Steps', add new steps to generate them.
- Use available feature engineering tools if they are potential impactful.
- Avoid creating redundant or excessively numerous features in one step.
- Exclude ID columns from feature generation and remove them.
- Each step do feature engineering to train, must do same for test separately at the same time.
- Each feature engineering operation performed on the train set must also applies to the test separately at the same time.
- Avoid using the label column to create features, except for cat encoding.
- Use the data from previous task result if exist, do not mock or reload data yourself.
"""

View file

@ -2,7 +2,7 @@ from metagpt.actions.ci.debug_code import DebugCode
from metagpt.actions.ci.execute_nb_code import ExecuteNbCode
from metagpt.actions.ci.ml_action import UpdateDataColumns, WriteCodeWithToolsML
from metagpt.logs import logger
from metagpt.roles.code_interpreter import CodeInterpreter
from metagpt.roles.ci.code_interpreter import CodeInterpreter
from metagpt.tools.tool_types import ToolTypes
from metagpt.utils.common import any_to_str

View file

@ -33,9 +33,9 @@ from metagpt.actions.add_requirement import UserRequirement
from metagpt.context_mixin import ContextMixin
from metagpt.logs import logger
from metagpt.memory import Memory
from metagpt.plan.planner import Planner
from metagpt.provider import HumanProvider
from metagpt.schema import Message, MessageQueue, SerializationMixin
from metagpt.strategy.planner import Planner
from metagpt.utils.common import any_to_name, any_to_str, role_raise_decorator
from metagpt.utils.project_repo import ProjectRepo
from metagpt.utils.repair_llm_raw_output import extract_state_value_from_output

View file

@ -335,7 +335,6 @@ class Task(BaseModel):
dependent_task_ids: list[str] = [] # Tasks prerequisite to this Task
instruction: str = ""
task_type: str = ""
code_steps: str = ""
code: str = ""
result: str = ""
is_success: bool = False
@ -348,7 +347,6 @@ class Task(BaseModel):
self.is_finished = False
def update_task_result(self, task_result: TaskResult):
self.code_steps = task_result.code_steps
self.code = task_result.code
self.result = task_result.result
self.is_success = task_result.is_success
@ -357,7 +355,6 @@ class Task(BaseModel):
class TaskResult(BaseModel):
"""Result of taking a task, with result and is_success required to be filled"""
code_steps: str = ""
code: str = ""
result: str
is_success: bool

View file

@ -124,11 +124,6 @@ class Planner(BaseModel):
def get_useful_memories(self, task_exclude_field=None) -> list[Message]:
"""find useful memories only to reduce context length and improve performance"""
# TODO dataset description , code steps
if task_exclude_field is None:
# Shorten the context as we don't need code steps after we get the codes.
# This doesn't affect current_task below, which should hold the code steps
task_exclude_field = {"code_steps"}
user_requirement = self.plan.goal
context = self.plan.context
tasks = [task.dict(exclude=task_exclude_field) for task in self.plan.tasks]

View file

@ -7,11 +7,10 @@
"""
from enum import Enum
from metagpt.tools import tool_types # this registers all tool types
from metagpt.tools import libs # this registers all tools
from metagpt.tools.tool_registry import TOOL_REGISTRY
_ = tool_types, libs, TOOL_REGISTRY # Avoid pre-commit error
_ = libs, TOOL_REGISTRY # Avoid pre-commit error
class SearchEngineType(Enum):

View file

@ -19,14 +19,29 @@ from metagpt.tools.tool_types import ToolTypes
TOOL_TYPE = ToolTypes.DATA_PREPROCESS.type_name
class MLProcess(object):
def fit(self, df):
class MLProcess:
def fit(self, df: pd.DataFrame):
"""
Fit a model to be used in subsequent transform.
Args:
df (pd.DataFrame): The input DataFrame.
"""
raise NotImplementedError
def transform(self, df):
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Transform the input DataFrame with the fitted model.
Args:
df (pd.DataFrame): The input DataFrame.
Returns:
pd.DataFrame: The transformed DataFrame.
"""
raise NotImplementedError
def fit_transform(self, df) -> pd.DataFrame:
def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Fit and transform the input DataFrame.
@ -40,8 +55,36 @@ class MLProcess(object):
return self.transform(df)
class DataPreprocessTool(MLProcess):
    """
    Completing a data preprocessing operation.
    """

    def __init__(self, features: list):
        """
        Initialize self.
        Args:
        features (list): Columns to be processed.
        """
        self.features = features
        self.model = None  # to be filled by specific subclass Tool

    # NOTE(review): fit/transform carry no docstrings of their own; presumably the
    # tool-schema generator falls back to the MLProcess docstrings - confirm before adding any.
    def fit(self, df: pd.DataFrame):
        # Nothing to learn when no columns were requested; otherwise delegate
        # to the wrapped model on just the selected columns.
        if len(self.features) > 0:
            self.model.fit(df[self.features])

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        columns = self.features
        # With no columns selected the caller gets the very same object back (no copy).
        if len(columns) == 0:
            return df
        # Work on a copy so the caller's DataFrame is left untouched.
        result = df.copy()
        selected = result[columns]
        result[columns] = self.model.transform(selected)
        return result
@register_tool(tool_type=TOOL_TYPE)
class FillMissingValue(MLProcess):
class FillMissingValue(DataPreprocessTool):
"""
Completing missing values with simple strategies.
"""
@ -58,282 +101,77 @@ class FillMissingValue(MLProcess):
Defaults to None.
"""
self.features = features
self.strategy = strategy
self.fill_value = fill_value
self.si = None
def fit(self, df: pd.DataFrame):
"""
Fit the FillMissingValue model.
Args:
df (pd.DataFrame): The input DataFrame.
"""
if len(self.features) == 0:
return
self.si = SimpleImputer(strategy=self.strategy, fill_value=self.fill_value)
self.si.fit(df[self.features])
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Transform the input DataFrame with the fitted model.
Args:
df (pd.DataFrame): The input DataFrame.
Returns:
pd.DataFrame: The transformed DataFrame.
"""
if len(self.features) == 0:
return df
new_df = df.copy()
new_df[self.features] = self.si.transform(new_df[self.features])
return new_df
self.model = SimpleImputer(strategy=strategy, fill_value=fill_value)
@register_tool(tool_type=TOOL_TYPE)
class MinMaxScale(MLProcess):
class MinMaxScale(DataPreprocessTool):
"""
Transform features by scaling each feature to a range, which is (0, 1).
"""
def __init__(self, features: list):
"""
Initialize self.
Args:
features (list): Columns to be processed.
"""
self.features = features
self.mms = None
def fit(self, df: pd.DataFrame):
"""
Fit the MinMaxScale model.
Args:
df (pd.DataFrame): The input DataFrame.
"""
self.mms = MinMaxScaler()
self.mms.fit(df[self.features])
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Transform the input DataFrame with the fitted model.
Args:
df (pd.DataFrame): The input DataFrame.
Returns:
pd.DataFrame: The transformed DataFrame.
"""
new_df = df.copy()
new_df[self.features] = self.mms.transform(new_df[self.features])
return new_df
self.model = MinMaxScaler()
@register_tool(tool_type=TOOL_TYPE)
class StandardScale(MLProcess):
class StandardScale(DataPreprocessTool):
"""
Standardize features by removing the mean and scaling to unit variance.
"""
def __init__(self, features: list):
"""
Initialize self.
Args:
features (list): Columns to be processed.
"""
self.features = features
self.ss = None
def fit(self, df: pd.DataFrame):
"""
Fit the StandardScale model.
Args:
df (pd.DataFrame): The input DataFrame.
"""
self.ss = StandardScaler()
self.ss.fit(df[self.features])
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Transform the input DataFrame with the fitted model.
Args:
df (pd.DataFrame): The input DataFrame.
Returns:
pd.DataFrame: The transformed DataFrame.
"""
new_df = df.copy()
new_df[self.features] = self.ss.transform(new_df[self.features])
return new_df
self.model = StandardScaler()
@register_tool(tool_type=TOOL_TYPE)
class MaxAbsScale(MLProcess):
class MaxAbsScale(DataPreprocessTool):
"""
Scale each feature by its maximum absolute value.
"""
def __init__(self, features: list):
"""
Initialize self.
Args:
features (list): Columns to be processed.
"""
self.features = features
self.mas = None
def fit(self, df: pd.DataFrame):
"""
Fit the MaxAbsScale model.
Args:
df (pd.DataFrame): The input DataFrame.
"""
self.mas = MaxAbsScaler()
self.mas.fit(df[self.features])
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Transform the input DataFrame with the fitted model.
Args:
df (pd.DataFrame): The input DataFrame.
Returns:
pd.DataFrame: The transformed DataFrame.
"""
new_df = df.copy()
new_df[self.features] = self.mas.transform(new_df[self.features])
return new_df
self.model = MaxAbsScaler()
@register_tool(tool_type=TOOL_TYPE)
class RobustScale(MLProcess):
class RobustScale(DataPreprocessTool):
"""
Apply the RobustScaler to scale features using statistics that are robust to outliers.
"""
def __init__(self, features: list):
"""
Initialize the RobustScale instance with feature names.
Args:
features (list): List of feature names to be scaled.
"""
self.features = features
self.rs = None
def fit(self, df: pd.DataFrame):
"""
Compute the median and IQR for scaling.
Args:
df (pd.DataFrame): Dataframe containing the features.
"""
self.rs = RobustScaler()
self.rs.fit(df[self.features])
def transform(self, df: pd.DataFrame):
"""
Scale features using the previously computed median and IQR.
Args:
df (pd.DataFrame): Dataframe containing the features to be scaled.
Returns:
pd.DataFrame: A new dataframe with scaled features.
"""
new_df = df.copy()
new_df[self.features] = self.rs.transform(new_df[self.features])
return new_df
self.model = RobustScaler()
@register_tool(tool_type=TOOL_TYPE)
class OrdinalEncode(MLProcess):
class OrdinalEncode(DataPreprocessTool):
"""
Encode categorical features as ordinal integers.
"""
def __init__(self, features: list):
"""
Initialize the OrdinalEncode instance with feature names.
Args:
features (list): List of categorical feature names to be encoded.
"""
self.features = features
self.oe = None
def fit(self, df: pd.DataFrame):
"""
Learn the ordinal encodings for the features.
Args:
df (pd.DataFrame): Dataframe containing the categorical features.
"""
self.oe = OrdinalEncoder()
self.oe.fit(df[self.features])
def transform(self, df: pd.DataFrame):
"""
Convert the categorical features to ordinal integers.
Args:
df (pd.DataFrame): Dataframe containing the categorical features to be encoded.
Returns:
pd.DataFrame: A new dataframe with the encoded features.
"""
new_df = df.copy()
new_df[self.features] = self.oe.transform(new_df[self.features])
return new_df
self.model = OrdinalEncoder()
@register_tool(tool_type=TOOL_TYPE)
class OneHotEncode(MLProcess):
class OneHotEncode(DataPreprocessTool):
"""
Apply one-hot encoding to specified categorical columns, the original columns will be dropped.
"""
def __init__(self, features: list):
"""
Initialize self.
Args:
features (list): Categorical columns to be one-hot encoded and dropped.
"""
self.features = features
self.ohe = None
def fit(self, df: pd.DataFrame):
"""
Fit the OneHotEncoding model.
Args:
df (pd.DataFrame): The input DataFrame.
"""
self.ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)
self.ohe.fit(df[self.features])
self.model = OneHotEncoder(handle_unknown="ignore", sparse=False)
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Transform the input DataFrame with the fitted model.
Args:
df (pd.DataFrame): The input DataFrame.
Returns:
pd.DataFrame: The transformed DataFrame.
"""
ts_data = self.ohe.transform(df[self.features])
new_columns = self.ohe.get_feature_names_out(self.features)
ts_data = self.model.transform(df[self.features])
new_columns = self.model.get_feature_names_out(self.features)
ts_data = pd.DataFrame(ts_data, columns=new_columns, index=df.index)
new_df = df.drop(self.features, axis=1)
new_df = pd.concat([new_df, ts_data], axis=1)
@ -341,7 +179,7 @@ class OneHotEncode(MLProcess):
@register_tool(tool_type=TOOL_TYPE)
class LabelEncode(MLProcess):
class LabelEncode(DataPreprocessTool):
"""
Apply label encoding to specified categorical columns in-place.
"""
@ -357,12 +195,6 @@ class LabelEncode(MLProcess):
self.le_encoders = []
def fit(self, df: pd.DataFrame):
"""
Fit the LabelEncode model.
Args:
df (pd.DataFrame): The input DataFrame.
"""
if len(self.features) == 0:
return
for col in self.features:
@ -370,15 +202,6 @@ class LabelEncode(MLProcess):
self.le_encoders.append(le)
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Transform the input DataFrame with the fitted model.
Args:
df (pd.DataFrame): The input DataFrame.
Returns:
pd.DataFrame: The transformed DataFrame.
"""
if len(self.features) == 0:
return df
new_df = df.copy()

View file

@ -45,12 +45,6 @@ class PolynomialExpansion(MLProcess):
self.poly = PolynomialFeatures(degree=degree, include_bias=False)
def fit(self, df: pd.DataFrame):
"""
Fit the PolynomialExpansion model.
Args:
df (pd.DataFrame): The input DataFrame.
"""
if len(self.cols) == 0:
return
if len(self.cols) > 10:
@ -61,15 +55,6 @@ class PolynomialExpansion(MLProcess):
self.poly.fit(df[self.cols].fillna(0))
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Transform the input DataFrame with the fitted model.
Args:
df (pd.DataFrame): The input DataFrame.
Returns:
pd.DataFrame: The transformed DataFrame without duplicated columns.
"""
if len(self.cols) == 0:
return df
ts_data = self.poly.transform(df[self.cols].fillna(0))
@ -97,24 +82,9 @@ class CatCount(MLProcess):
self.encoder_dict = None
def fit(self, df: pd.DataFrame):
"""
Fit the CatCount model.
Args:
df (pd.DataFrame): The input DataFrame.
"""
self.encoder_dict = df[self.col].value_counts().to_dict()
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Transform the input DataFrame with the fitted model.
Args:
df (pd.DataFrame): The input DataFrame.
Returns:
pd.DataFrame: The transformed DataFrame.
"""
new_df = df.copy()
new_df[f"{self.col}_cnt"] = new_df[self.col].map(self.encoder_dict)
return new_df
@ -139,24 +109,9 @@ class TargetMeanEncoder(MLProcess):
self.encoder_dict = None
def fit(self, df: pd.DataFrame):
"""
Fit the TargetMeanEncoder model.
Args:
df (pd.DataFrame): The input DataFrame.
"""
self.encoder_dict = df.groupby(self.col)[self.label].mean().to_dict()
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Transform the input DataFrame with the fitted model.
Args:
df (pd.DataFrame): The input DataFrame.
Returns:
pd.DataFrame: The transformed DataFrame.
"""
new_df = df.copy()
new_df[f"{self.col}_target_mean"] = new_df[self.col].map(self.encoder_dict)
return new_df
@ -185,12 +140,6 @@ class KFoldTargetMeanEncoder(MLProcess):
self.encoder_dict = None
def fit(self, df: pd.DataFrame):
"""
Fit the KFoldTargetMeanEncoder model.
Args:
df (pd.DataFrame): The input DataFrame.
"""
tmp = df.copy()
kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state)
@ -203,15 +152,6 @@ class KFoldTargetMeanEncoder(MLProcess):
self.encoder_dict = tmp.groupby(self.col)[col_name].mean().to_dict()
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Transform the input DataFrame with the fitted model.
Args:
df (pd.DataFrame): The input DataFrame.
Returns:
pd.DataFrame: The transformed DataFrame.
"""
new_df = df.copy()
new_df[f"{self.col}_kf_target_mean"] = new_df[self.col].map(self.encoder_dict)
return new_df
@ -255,12 +195,6 @@ class CatCross(MLProcess):
return new_col, comb_map
def fit(self, df: pd.DataFrame):
"""
Fit the CatCross model.
Args:
df (pd.DataFrame): The input DataFrame.
"""
for col in self.cols:
if df[col].nunique() > self.max_cat_num:
self.cols.remove(col)
@ -269,15 +203,6 @@ class CatCross(MLProcess):
self.combs_map = dict(res)
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Transform the input DataFrame with the fitted model.
Args:
df (pd.DataFrame): The input DataFrame.
Returns:
pd.DataFrame: The transformed DataFrame.
"""
new_df = df.copy()
for comb in self.combs:
new_col = f"{comb[0]}_{comb[1]}"
@ -310,12 +235,6 @@ class GroupStat(MLProcess):
self.group_df = None
def fit(self, df: pd.DataFrame):
"""
Fit the GroupStat model.
Args:
df (pd.DataFrame): The input DataFrame.
"""
group_df = df.groupby(self.group_col)[self.agg_col].agg(self.agg_funcs).reset_index()
group_df.columns = [self.group_col] + [
f"{self.agg_col}_{agg_func}_by_{self.group_col}" for agg_func in self.agg_funcs
@ -323,15 +242,6 @@ class GroupStat(MLProcess):
self.group_df = group_df
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Transform the input DataFrame with the fitted model.
Args:
df (pd.DataFrame): The input DataFrame.
Returns:
pd.DataFrame: The transformed DataFrame.
"""
new_df = df.merge(self.group_df, on=self.group_col, how="left")
return new_df
@ -355,25 +265,10 @@ class SplitBins(MLProcess):
self.encoder = None
def fit(self, df: pd.DataFrame):
"""
Fit the SplitBins model.
Args:
df (pd.DataFrame): The input DataFrame.
"""
self.encoder = KBinsDiscretizer(strategy=self.strategy, encode="ordinal")
self.encoder.fit(df[self.cols].fillna(0))
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Transform the input DataFrame with the fitted model.
Args:
df (pd.DataFrame): The input DataFrame.
Returns:
pd.DataFrame: The transformed DataFrame.
"""
new_df = df.copy()
new_df[self.cols] = self.encoder.transform(new_df[self.cols].fillna(0))
return new_df
@ -397,24 +292,9 @@ class ExtractTimeComps(MLProcess):
self.time_comps = time_comps
def fit(self, df: pd.DataFrame):
"""
Fit the ExtractTimeComps model.
Args:
df (pd.DataFrame): The input DataFrame.
"""
pass
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Transform the input DataFrame with the fitted model.
Args:
df (pd.DataFrame): The input DataFrame.
Returns:
pd.DataFrame: The transformed DataFrame.
"""
time_s = pd.to_datetime(df[self.time_col], errors="coerce")
time_comps_df = pd.DataFrame()
@ -445,12 +325,6 @@ class GeneralSelection(MLProcess):
self.feats = []
def fit(self, df: pd.DataFrame):
"""
Fit the GeneralSelection model.
Args:
df (pd.DataFrame): The input DataFrame.
"""
feats = [f for f in df.columns if f != self.label_col]
for col in df.columns:
if df[col].isnull().sum() / df.shape[0] == 1:
@ -468,15 +342,6 @@ class GeneralSelection(MLProcess):
self.feats = feats
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Transform the input DataFrame with the fitted model.
Args:
df (pd.DataFrame): The input DataFrame.
Returns:
pd.DataFrame: The transformed DataFrame contain label_col.
"""
new_df = df[self.feats + [self.label_col]]
return new_df
@ -501,12 +366,6 @@ class TreeBasedSelection(MLProcess):
self.feats = None
def fit(self, df: pd.DataFrame):
"""
Fit the TreeBasedSelection model.
Args:
df (pd.DataFrame): The input DataFrame.
"""
params = {
"boosting_type": "gbdt",
"objective": "binary",
@ -538,15 +397,6 @@ class TreeBasedSelection(MLProcess):
self.feats.append(self.label_col)
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Transform the input DataFrame with the fitted model.
Args:
df (pd.DataFrame): The input DataFrame.
Returns:
pd.DataFrame: The transformed DataFrame contain label_col.
"""
new_df = df[self.feats]
return new_df
@ -571,12 +421,6 @@ class VarianceBasedSelection(MLProcess):
self.selector = VarianceThreshold(threshold=self.threshold)
def fit(self, df: pd.DataFrame):
"""
Fit the VarianceBasedSelection model.
Args:
df (pd.DataFrame): The input DataFrame.
"""
num_cols = df.select_dtypes(include=np.number).columns.tolist()
cols = [f for f in num_cols if f not in [self.label_col]]
@ -585,14 +429,5 @@ class VarianceBasedSelection(MLProcess):
self.feats.append(self.label_col)
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Transform the input DataFrame with the fitted model.
Args:
df (pd.DataFrame): The input DataFrame.
Returns:
pd.DataFrame: The transformed DataFrame contain label_col.
"""
new_df = df[self.feats]
return new_df

View file

@ -12,7 +12,8 @@ def convert_code_to_tool_schema(obj, include: list[str] = []):
for name, method in inspect.getmembers(obj, inspect.isfunction):
if include and name not in include:
continue
method_doc = inspect.getdoc(method)
# method_doc = inspect.getdoc(method)
method_doc = get_class_method_docstring(obj, name)
if method_doc:
schema["methods"][name] = docstring_to_schema(method_doc)
@ -22,8 +23,6 @@ def convert_code_to_tool_schema(obj, include: list[str] = []):
**docstring_to_schema(docstring),
}
schema = {obj.__name__: schema}
return schema
@ -70,3 +69,13 @@ def docstring_to_schema(docstring: str):
schema["returns"] = [{"type": ret[0], "description": remove_spaces(ret[1])} for ret in returns]
return schema
def get_class_method_docstring(cls, method_name):
    """Return the first non-empty docstring for `method_name` along the MRO of `cls`, or None."""
    for klass in cls.__mro__:
        namespace = vars(klass)
        # Only classes that define the attribute themselves are considered.
        if method_name not in namespace:
            continue
        doc = namespace[method_name].__doc__
        # An override without a docstring does not stop the search; keep walking up.
        if doc:
            return doc
    # Neither the class nor any ancestor documents this attribute.
    return None

View file

@ -39,7 +39,6 @@ class ToolRegistry(BaseModel):
tool_type="other",
tool_source_object=None,
include_functions=[],
make_schema_if_not_exists=True,
verbose=False,
):
if self.has_tool(tool_name):
@ -57,19 +56,11 @@ class ToolRegistry(BaseModel):
schema_path = schema_path or TOOL_SCHEMA_PATH / tool_type / f"{tool_name}.yml"
if not os.path.exists(schema_path):
if make_schema_if_not_exists:
logger.warning(f"no schema found, will make schema at {schema_path}")
schema_dict = make_schema(tool_source_object, include_functions, schema_path)
else:
logger.warning(f"no schema found at assumed schema_path {schema_path}, skip registering {tool_name}")
return
else:
with open(schema_path, "r", encoding="utf-8") as f:
schema_dict = yaml.safe_load(f)
if not schema_dict:
schemas = make_schema(tool_source_object, include_functions, schema_path)
if not schemas:
return
schemas = schema_dict.get(tool_name) or list(schema_dict.values())[0]
schemas["tool_path"] = tool_path # corresponding code file path of the tool
try:
ToolSchema(**schemas) # validation
@ -78,11 +69,13 @@ class ToolRegistry(BaseModel):
# logger.warning(
# f"{tool_name} schema not conforms to required format, but will be used anyway. Mismatch: {e}"
# )
tool = Tool(name=tool_name, path=tool_path, schemas=schemas, code=tool_code)
self.tools[tool_name] = tool
self.tools_by_types[tool_type][tool_name] = tool
if verbose:
logger.info(f"{tool_name} registered")
logger.info(f"schema made at {str(schema_path)}, can be used for checking")
def has_tool(self, key: str) -> Tool:
return key in self.tools
@ -107,12 +100,10 @@ class ToolRegistry(BaseModel):
TOOL_REGISTRY = ToolRegistry(tool_types=ToolTypes)
def register_tool(tool_name: str = "", tool_type: str = "other", schema_path: str = "", **kwargs):
def register_tool(tool_type: str = "other", schema_path: str = "", **kwargs):
"""register a tool to registry"""
def decorator(cls, tool_name=tool_name):
tool_name = tool_name or cls.__name__
def decorator(cls):
# Get the file path where the function / class is defined and the source code
file_path = inspect.getfile(cls)
if "metagpt" in file_path:
@ -120,7 +111,7 @@ def register_tool(tool_name: str = "", tool_type: str = "other", schema_path: st
source_code = inspect.getsource(cls)
TOOL_REGISTRY.register_tool(
tool_name=tool_name,
tool_name=cls.__name__,
tool_path=file_path,
schema_path=schema_path,
tool_code=source_code,
@ -142,7 +133,6 @@ def make_schema(tool_source_object, include, path):
# import json
# with open(str(path).replace("yml", "json"), "w", encoding="utf-8") as f:
# json.dump(schema, f, ensure_ascii=False, indent=4)
logger.info(f"schema made at {path}")
except Exception as e:
schema = {}
logger.error(f"Fail to make schema: {e}")

View file

@ -5,7 +5,7 @@ from pydantic import BaseModel
def remove_spaces(text):
return re.sub(r"\s+", " ", text)
return re.sub(r"\s+", " ", text).strip()
class DocstringParser(BaseModel):

File diff suppressed because one or more lines are too long

View file

@ -8,8 +8,8 @@ from metagpt.actions.ci.write_analysis_code import (
WriteCodeWithTools,
)
from metagpt.logs import logger
from metagpt.plan.planner import STRUCTURAL_CONTEXT
from metagpt.schema import Message, Plan, Task
from metagpt.strategy.planner import STRUCTURAL_CONTEXT
@pytest.mark.skip
@ -37,13 +37,12 @@ async def test_write_code_by_list_plan():
@pytest.mark.asyncio
async def test_tool_recommendation():
task = "clean and preprocess the data"
code_steps = ""
available_tools = {
"FillMissingValue": "Filling missing values",
"SplitBins": "Bin continuous data into intervals and return the bin identifier encoded as an integer value",
}
write_code = WriteCodeWithTools()
tools = await write_code._recommend_tool(task, code_steps, available_tools)
tools = await write_code._recommend_tool(task, available_tools)
assert len(tools) == 1
assert "FillMissingValue" in tools

View file

@ -1,7 +1,7 @@
import pytest
from metagpt.logs import logger
from metagpt.roles.code_interpreter import CodeInterpreter
from metagpt.roles.ci.code_interpreter import CodeInterpreter
@pytest.mark.asyncio

View file

@ -2,7 +2,7 @@ import pytest
from metagpt.actions.ci.execute_nb_code import ExecuteNbCode
from metagpt.logs import logger
from metagpt.roles.ml_engineer import MLEngineer
from metagpt.roles.ci.ml_engineer import MLEngineer
from metagpt.schema import Message, Plan, Task
from metagpt.tools.tool_types import ToolTypes
from tests.metagpt.actions.ci.test_debug_code import CODE, DebugContext, ErrorStr
@ -22,7 +22,6 @@ MockPlan = Plan(
dependent_task_ids=[],
instruction="Perform exploratory data analysis on the train dataset to understand the features and target variable.",
task_type="eda",
code_steps="",
code="",
result="",
is_success=False,
@ -35,7 +34,6 @@ MockPlan = Plan(
dependent_task_ids=[],
instruction="Perform exploratory data analysis on the train dataset to understand the features and target variable.",
task_type="eda",
code_steps="",
code="",
result="",
is_success=False,

View file

@ -17,7 +17,7 @@ def test_docstring_to_schema():
pd.DataFrame: The transformed DataFrame.
"""
expected = {
"description": " Some test desc. ",
"description": "Some test desc.",
"parameters": {
"properties": {
"features": {"type": "list", "description": "Columns to be processed."},
@ -97,47 +97,45 @@ def dummy_fn(df: pd.DataFrame) -> dict:
def test_convert_code_to_tool_schema_class():
expected = {
"DummyClass": {
"type": "class",
"description": "Completing missing values with simple strategies.",
"methods": {
"__init__": {
"description": "Initialize self. ",
"parameters": {
"properties": {
"features": {"type": "list", "description": "Columns to be processed."},
"strategy": {
"type": "str",
"description": "The imputation strategy, notice 'mean' and 'median' can only be used for numeric features. Enum: ['mean', 'median', 'most_frequent', 'constant']. Defaults to 'mean'.",
"default": "'mean'",
"enum": ["'mean'", "'median'", "'most_frequent'", "'constant'"],
},
"fill_value": {
"type": "int",
"description": "Fill_value is used to replace all occurrences of missing_values. Defaults to None.",
"default": "None",
},
"type": "class",
"description": "Completing missing values with simple strategies.",
"methods": {
"__init__": {
"description": "Initialize self.",
"parameters": {
"properties": {
"features": {"type": "list", "description": "Columns to be processed."},
"strategy": {
"type": "str",
"description": "The imputation strategy, notice 'mean' and 'median' can only be used for numeric features. Enum: ['mean', 'median', 'most_frequent', 'constant']. Defaults to 'mean'.",
"default": "'mean'",
"enum": ["'mean'", "'median'", "'most_frequent'", "'constant'"],
},
"fill_value": {
"type": "int",
"description": "Fill_value is used to replace all occurrences of missing_values. Defaults to None.",
"default": "None",
},
"required": ["features"],
},
},
"fit": {
"description": "Fit the FillMissingValue model. ",
"parameters": {
"properties": {"df": {"type": "pd.DataFrame", "description": "The input DataFrame."}},
"required": ["df"],
},
},
"transform": {
"description": "Transform the input DataFrame with the fitted model. ",
"parameters": {
"properties": {"df": {"type": "pd.DataFrame", "description": "The input DataFrame."}},
"required": ["df"],
},
"returns": [{"type": "pd.DataFrame", "description": "The transformed DataFrame."}],
"required": ["features"],
},
},
}
"fit": {
"description": "Fit the FillMissingValue model.",
"parameters": {
"properties": {"df": {"type": "pd.DataFrame", "description": "The input DataFrame."}},
"required": ["df"],
},
},
"transform": {
"description": "Transform the input DataFrame with the fitted model.",
"parameters": {
"properties": {"df": {"type": "pd.DataFrame", "description": "The input DataFrame."}},
"required": ["df"],
},
"returns": [{"type": "pd.DataFrame", "description": "The transformed DataFrame."}],
},
},
}
schema = convert_code_to_tool_schema(DummyClass)
assert schema == expected
@ -145,14 +143,12 @@ def test_convert_code_to_tool_schema_class():
def test_convert_code_to_tool_schema_function():
expected = {
"dummy_fn": {
"type": "function",
"description": "Analyzes a DataFrame and categorizes its columns based on data types. ",
"parameters": {
"properties": {"df": {"type": "pd.DataFrame", "description": "The DataFrame to be analyzed."}},
"required": ["df"],
},
}
"type": "function",
"description": "Analyzes a DataFrame and categorizes its columns based on data types.",
"parameters": {
"properties": {"df": {"type": "pd.DataFrame", "description": "The DataFrame to be analyzed."}},
"required": ["df"],
},
}
schema = convert_code_to_tool_schema(dummy_fn)
assert schema == expected

View file

@ -14,18 +14,6 @@ def tool_registry_full():
return ToolRegistry(tool_types=ToolTypes)
@pytest.fixture
def schema_yaml(mocker):
mock_yaml_content = """
tool_name:
key1: value1
key2: value2
"""
mocker.patch("os.path.exists", return_value=True)
mocker.patch("builtins.open", mocker.mock_open(read_data=mock_yaml_content))
return mocker
# Test Initialization
def test_initialization(tool_registry):
assert isinstance(tool_registry, ToolRegistry)
@ -42,33 +30,46 @@ def test_initialize_with_tool_types(tool_registry_full):
assert "data_preprocess" in tool_registry_full.tool_types
# Test Tool Registration
def test_register_tool(tool_registry, schema_yaml):
tool_registry.register_tool("TestTool", "/path/to/tool")
assert "TestTool" in tool_registry.tools
class TestClassTool:
"""test class"""
def test_class_fn(self):
"""test class fn"""
pass
# Test Tool Registration with Non-existing Schema
def test_register_tool_no_schema(tool_registry, mocker):
mocker.patch("os.path.exists", return_value=False)
tool_registry.register_tool("TestTool", "/path/to/tool")
assert "TestTool" not in tool_registry.tools
def test_fn():
"""test function"""
pass
# Test Tool Registration Class
def test_register_tool_class(tool_registry):
tool_registry.register_tool("TestClassTool", "/path/to/tool", tool_source_object=TestClassTool)
assert "TestClassTool" in tool_registry.tools
# Test Tool Registration Function
def test_register_tool_fn(tool_registry):
tool_registry.register_tool("test_fn", "/path/to/tool", tool_source_object=test_fn)
assert "test_fn" in tool_registry.tools
# Test Tool Existence Checks
def test_has_tool(tool_registry, schema_yaml):
tool_registry.register_tool("TestTool", "/path/to/tool")
assert tool_registry.has_tool("TestTool")
def test_has_tool(tool_registry):
tool_registry.register_tool("TestClassTool", "/path/to/tool", tool_source_object=TestClassTool)
assert tool_registry.has_tool("TestClassTool")
assert not tool_registry.has_tool("NonexistentTool")
# Test Tool Retrieval
def test_get_tool(tool_registry, schema_yaml):
tool_registry.register_tool("TestTool", "/path/to/tool")
tool = tool_registry.get_tool("TestTool")
def test_get_tool(tool_registry):
tool_registry.register_tool("TestClassTool", "/path/to/tool", tool_source_object=TestClassTool)
tool = tool_registry.get_tool("TestClassTool")
assert tool is not None
assert tool.name == "TestTool"
assert tool.name == "TestClassTool"
assert tool.path == "/path/to/tool"
assert "description" in tool.schemas
# Similar tests for has_tool_type, get_tool_type, get_tools_by_type
@ -83,12 +84,12 @@ def test_get_tool_type(tool_registry_full):
assert retrieved_type.name == "data_preprocess"
def test_get_tools_by_type(tool_registry, schema_yaml):
def test_get_tools_by_type(tool_registry):
tool_type_name = "TestType"
tool_name = "TestTool"
tool_path = "/path/to/tool"
tool_registry.register_tool(tool_name, tool_path, tool_type=tool_type_name)
tool_registry.register_tool(tool_name, tool_path, tool_type=tool_type_name, tool_source_object=TestClassTool)
tools_by_type = tool_registry.get_tools_by_type(tool_type_name)
assert tools_by_type is not None

View file

@ -14,7 +14,7 @@ from metagpt.utils.save_code import DATA_PATH, save_code_file
def test_save_code_file_python():
save_code_file("example", "print('Hello, World!')")
file_path = DATA_PATH / "output" / "example" / "code.py"
assert file_path.exists, f"File does not exist: {file_path}"
assert file_path.exists(), f"File does not exist: {file_path}"
content = file_path.read_text()
assert "print('Hello, World!')" in content, "File content does not match"
@ -35,7 +35,7 @@ async def test_save_code_file_notebook():
# Save as a Notebook file
save_code_file("example_nb", executor.nb, file_format="ipynb")
file_path = DATA_PATH / "output" / "example_nb" / "code.ipynb"
assert file_path.exists, f"Notebook file does not exist: {file_path}"
assert file_path.exists(), f"Notebook file does not exist: {file_path}"
# Additional checks specific to notebook format
notebook = nbformat.read(file_path, as_version=4)