Merge branch 'dev_tool_selection' of https://gitlab.deepwisdomai.com/agents/data_agents_opt into dev_tool_selection

Resolve conflicts
2026-05-03 12:52:37 +02:00 · 2023-12-08 13:51:00 +08:00 · 2023-12-08 13:51:00 +08:00 · b9d12f17ae
commit b9d12f17ae
parent 13e2b05812 fe2b79fedc
8 changed files with 249 additions and 136 deletions
--- a/metagpt/actions/write_analysis_code.py
+++ b/metagpt/actions/write_analysis_code.py
@ -12,21 +12,15 @@ from metagpt.prompts.ml_engineer import (
    TOOL_RECOMMENDATION_PROMPT,
    SELECT_FUNCTION_TOOLS,
    CODE_GENERATOR_WITH_TOOLS,
-    TOOL_ORGANIZATION_PROMPT,
+    TOOL_USAGE_PROMPT,
    ML_SPECIFIC_PROMPT,
    ML_MODULE_MAP,
-    TOOL_OUTPUT_DESC,
-    TOOL_USAGE_PROMPT,
+    TOOL_OUTPUT_DESC, DATA_PROCESS_PROMPT,
+    GENERATE_CODE_PROMPT
 )
 from metagpt.schema import Message, Plan
 from metagpt.tools.functions import registry
-from metagpt.utils.common import create_func_config
-from metagpt.prompts.ml_engineer import GEN_DATA_DESC_PROMPT, GENERATE_CODE_PROMPT
-
-from metagpt.actions.execute_code import ExecutePyCode
-
-
-
+from metagpt.utils.common import create_func_config, remove_comments


 class BaseWriteAnalysisCode(Action):
@ -83,8 +77,6 @@ class BaseWriteAnalysisCode(Action):
        """


-
-
 class WriteCodeByGenerate(BaseWriteAnalysisCode):
    """Write code fully by generation"""

@ -107,7 +99,6 @@ class WriteCodeByGenerate(BaseWriteAnalysisCode):

 class WriteCodeWithTools(BaseWriteAnalysisCode):
    """Write code with help of local available tools. Choose tools first, then generate code to use the tools"""
-    execute_code = ExecutePyCode()

    @staticmethod
    def _parse_recommend_tools(module: str, recommend_tools: list) -> List[Dict]:
@ -132,7 +123,7 @@ class WriteCodeWithTools(BaseWriteAnalysisCode):

    async def _tool_recommendation(
            self,
-            context: [List[Message]],
+            task: str,
            code_steps: str,
            available_tools: list
    ) -> list:
@ -147,12 +138,11 @@ class WriteCodeWithTools(BaseWriteAnalysisCode):
        Returns:
            list: recommended tools for the specified task
        """
-        system_prompt = TOOL_RECOMMENDATION_PROMPT.format(
+        prompt = TOOL_RECOMMENDATION_PROMPT.format(
+            current_task=task,
            code_steps=code_steps,
            available_tools=available_tools,
        )
-        prompt = self.process_msg(context, system_prompt)
-
        tool_config = create_func_config(SELECT_FUNCTION_TOOLS)
        rsp = await self.llm.aask_code(prompt, **tool_config)
        recommend_tools = rsp["recommend_tools"]
@ -163,17 +153,16 @@ class WriteCodeWithTools(BaseWriteAnalysisCode):
            context: List[Message],
            plan: Plan = None,
            code_steps: str = "",
+            column_info: str = "",
            **kwargs,
    ) -> str:
        task_type = plan.current_task.task_type
-        logger.info(f"task_type is: {task_type}")
        available_tools = registry.get_all_schema_by_module(task_type)
        special_prompt = ML_SPECIFIC_PROMPT.get(task_type, "")

        column_names = kwargs.get("column_names", {})
        finished_tasks = plan.get_finished_tasks()
-        code_context = [task.code for task in finished_tasks]
-
+        code_context = [remove_comments(task.code) for task in finished_tasks]
        code_context = "\n\n".join(code_context)

        if len(available_tools) > 0:
@ -182,17 +171,17 @@ class WriteCodeWithTools(BaseWriteAnalysisCode):
                for tool in available_tools
            ]

-            final_code = {}
-            new_code = ""
-            code_steps_dict = eval(code_steps)
-
-            recommend_tools = await self._tool_recommendation(context, code_steps, available_tools)
+            recommend_tools = await self._tool_recommendation(
+                plan.current_task.instruction,
+                code_steps,
+                available_tools
+            )
            tool_catalog = self._parse_recommend_tools(task_type, recommend_tools)
            logger.info(f"Recommended tools: \n{recommend_tools}")

            module_name = ML_MODULE_MAP[task_type]
            output_desc = TOOL_OUTPUT_DESC.get(task_type, "")
-
+            new_code = ""

            for idx, tool in enumerate(recommend_tools):
                hist_info = f"Previous finished code is \n\n ```Python {code_context} ``` \n\n "
--- a/metagpt/prompts/ml_engineer.py
+++ b/metagpt/prompts/ml_engineer.py
@ -8,19 +8,22 @@ GEN_DATA_DESC_PROMPT = """
 Here is the head 5 rows of the dataset:
 {data_head}

-Please provide a brief one-sentence background of the dataset, and concise descriptions for each column. Keep descriptions short yet informative.
+Please provide a brief one-sentence background of the dataset, and concise meaning for each column. Keep descriptions short.

 Output the information in a JSON format, as shown in this example:
 ```json
 {
    "data_desc": "Brief dataset background.",
    "column_desc": {
-        "column_name1": "Description of the first column.",
-        "column_name2": "Description of the second column.",
+        "column_name1": "Abstract meaning of the first column.",
+        "column_name2": "Abstract meaning of the second column.",
        ...
    }
 }
 ```
+
+# Constraints:
+- Don't contain specific values or examples found in the data column.
 """

 ASSIGN_TASK_TYPE_PROMPT = """
@ -53,19 +56,22 @@ ASSIGN_TASK_TYPE = {
 }

 TOOL_RECOMMENDATION_PROMPT = """
-Your are a tool recommender, the main goal is to recommend suitable tools for current task before coding. A tool means a function that can be used to help you solve the task.
+## User Requirement:
+{current_task}

-## List of Available Tools:
-{available_tools}
-
-This is a task guide for the current task, including detailed code steps. You can refer to it when recommending tools.
+## Task
+Recommend up to five tools from 'Available Tools' that can help solve the 'User Requirement'. 
+This is a detailed code steps for current task. You can refer to it when recommending tools.
 {code_steps}

+## Available Tools:
+{available_tools}
+
 ## Tool Selection and Instructions:
- For the task, choose up to five tools that are most likely to be useful in solving the task.
+- Select tools most relevant to completing the 'User Requirement'.
 - If you believe that no tools are suitable, indicate with an empty list.
 - Only list the names of the tools, not the full schema of each tool.
- The result should only contain tool names that are in the list of available tools.
+- Ensure selected tools are listed in 'Available Tools'.
 """

 SELECT_FUNCTION_TOOLS = {
@ -135,6 +141,7 @@ Make sure use the columns from the dataset columns: {column_names}
 Finish your coding tasks as a helpful programmer based on the tools.

 """
+
 GENERATE_CODE_PROMPT = """
 ## Target
 {goal}
@ -142,13 +149,17 @@ GENERATE_CODE_PROMPT = """
 Specifically, {special_prompt}


-## History Info
+## Finished Task and Code 
 {context}

 ## Code Steps for Current Task:
 Follow steps below when you writing code if it's convenient.
 {code_steps}

+## Instruction
+Finished task and code are executable, and you should based on the code to continue your current task
+Do not repeat functions and code, try to reuse the code in [Finished Task and Code]
+ 
 ## Your Output Format:
 Generate the complete code for this task:
 ```python
@ -162,6 +173,34 @@ Finish your coding tasks as a helpful programmer based on the code.

 """

+TOOL_USAGE_PROMPT = """
+## Target
+{goal}
+
+## History Info
+{context}
+
+## Available Tools:
+Each function is described in JSON format, including the function name and parameters. {output_desc}
+{function_catalog}
+
+When you call a function above, you should import the function from `{module_name}` first, e.g.:
+```python
+from metagpt.tools.functions.libs.data_preprocess import fill_missing_value
+```end
+
+## Your Output Format:
+Generate the complete code for this task:
+```python
+# Tools used: [function names or 'none']
+<your code for the current task, without any comments>
+```end
+
+## Attention:
+Make sure use the columns from the dataset columns
+Finish your coding tasks as a helpful programmer based on the tools.
+"""
+
 TOOL_ORGANIZATION_PROMPT = """
 The previous conversation has provided all tasks step-by-step for the use goal and their statuses. 
 Now, begin writing code for the current task. This code should writen strictly on the basis of all previous completed tasks code, not a standalone code. And avoid writing duplicate code that has already been written in previous tasks, such as repeated import of packages, reading data, etc.
@ -210,6 +249,66 @@ The current task is about feature engineering. when performing it, please adhere
 - Before generating a new feature, ensure the used features are already processed and ready to use.
 """

+DATA_PROCESS_PROMPT = """
+# Background
+As a data scientist, you need to help user to achieve the goal [{user_requirement}] step-by-step in an continuous Jupyter notebook.
+
+## Done Tasks
+```python
+{history_code}
+```end
+
+## Current Task
+{current_task}
+
+# Latest Data Info
+Latest data info after previous tasks:
+{column_info}
+
+# Task
+Write a Python function for 'Current Task'. Start by copying the input DataFrame. Avoid duplicating code from 'Done Tasks'.
+Specifically, {special_prompt}
+
+# Code Steps:
+Follow steps below when you writing code if it's convenient.
+{code_steps}
+
+# Capabilities
+- You can utilize pre-defined tools in any code lines from 'Available Tools' in the form of python functions.
+- You can freely combine the use of any other public packages, like sklearn, numpy, pandas, etc..
+- You can do anything about data preprocessing, feature engineering, model training, etc..
+
+# Available Tools:
+Each function tool is described in JSON format. {output_desc}
+When you call a function below, import the function from `{module_name}` first.
+{function_catalog}
+
+# Output Example:
+when current task is "fill missing value and handle outliers", the output code be like:
+```python
+from metagpt.tools.functions.libs.data_preprocess import fill_missing_value
+
+def function_name(df):
+    df_processed = df.copy()
+    num_cols = df_processed.select_dtypes(include='number').columns.tolist()
+    df_processed = fill_missing_value(df_processed, num_cols, 'mean')
+    
+    for col in num_cols:
+        low, high = df_processed[col].quantile([0.01, 0.99])
+        df_processed[col] = df_processed[col].clip(low, high)
+    return df_processed
+
+df_processed = function_name(df)
+print(df_processed.info())
+```end
+
+# Constraints:
+- Ensure the output new code is executable in the same Jupyter notebook with previous tasks code have been executed.
+- Prioritize using pre-defined tools for the same functionality.
+- Return DataFrame should always be named `df_processed`, while the input DataFrame should based on the done tasks' output DataFrame.
+- Limit to one print statement for the output DataFrame's info.
+"""
+
 MODEL_TRAIN_PROMPT = """
 The current task is about training a model, please ensure high performance:
 - Keep in mind that your user prioritizes results and is highly focused on model performance. So, when needed, feel free to use models of any complexity to improve effectiveness, such as lightGBM, XGBoost, CatBoost, etc.
@ -217,9 +316,9 @@ The current task is about training a model, please ensure high performance:
 - Use the data from previous task result directly, do not mock or reload data yourself.
 """

-DATA_PREPROCESS_OUTPUT_DESC = "Please note that all functions uniformly output a processed pandas.DataFrame, facilitating seamless integration into the broader workflow."
+DATA_PREPROCESS_OUTPUT_DESC = "Please note that all functions output a updated pandas.DataFrame after data preprocessing."

-FEATURE_ENGINEERING_OUTPUT_DESC = "Please note that all functions uniformly output updated pandas.DataFrame with feature engineering applied."
+FEATURE_ENGINEERING_OUTPUT_DESC = "Please note that all functions output a updated pandas.DataFrame with new features added or existing features modified."

 CLASSIFICATION_MODEL_OUTPUT_DESC = ""

--- a/metagpt/roles/ml_engineer.py
+++ b/metagpt/roles/ml_engineer.py
@ -1,21 +1,21 @@
-import glob
 import json
+import re
 from typing import List

 import fire
 import pandas as pd
-import re

 from metagpt.actions import Action
 from metagpt.actions.execute_code import ExecutePyCode
 from metagpt.actions.write_analysis_code import WriteCodeByGenerate, WriteCodeWithTools
+from metagpt.actions.write_code_steps import WriteCodeSteps
 from metagpt.actions.write_plan import WritePlan
+from metagpt.const import DATA_PATH
 from metagpt.logs import logger
 from metagpt.prompts.ml_engineer import GEN_DATA_DESC_PROMPT
 from metagpt.roles import Role
 from metagpt.schema import Message, Plan
 from metagpt.utils.common import CodeParser
-from metagpt.actions.write_code_steps import WriteCodeSteps
 from metagpt.actions.debug_code import DebugCode

 STRUCTURAL_CONTEXT = """
@ -74,32 +74,16 @@ def read_data(file: str) -> pd.DataFrame:
    return df


-def get_samples(df: pd.DataFrame) -> str:
+def get_column_info(df: pd.DataFrame) -> str:
    data = []
-
-    if len(df) > 5:
-        df_ = df.sample(5, random_state=0)
-    else:
-        df_ = df
-
-    for i in list(df_):
+    for i in df.columns:
        nan_freq = float("%.2g" % (df[i].isna().mean() * 100))
        n_unique = df[i].nunique()
-        s = df_[i].tolist()
+        data.append([i, df[i].dtype, nan_freq, n_unique])

-        if str(df[i].dtype) == "float64":
-            s = [round(sample, 2) if not pd.isna(sample) else None for sample in s]
-
-        data.append([df_[i].name, df[i].dtype, nan_freq, n_unique, s])
    samples = pd.DataFrame(
        data,
-        columns=[
-            "Column_name",
-            "Data_type",
-            "NaN_Frequency(%)",
-            "N_unique",
-            "Samples",
-        ],
+        columns=["Column_name", "Data_type", "NaN_Frequency(%)", "N_unique"],
    )
    return samples.to_string(index=False)

@ -128,20 +112,19 @@ class AskReview(Action):


 class GenerateDataDesc(Action):
-    async def run(self, files: list) -> dict:
+    async def run(self, file: str) -> dict:
        data_desc = {}
-        for file in files:
-            df = read_data(file)
-            file_name = file.split("/")[-1]
-            data_head = df.head().to_dict(orient="list")
-            data_head = json.dumps(data_head, indent=4, ensure_ascii=False)
-            prompt = GEN_DATA_DESC_PROMPT.replace("{data_head}", data_head)
-            rsp = await self._aask(prompt)
-            rsp = CodeParser.parse_code(block=None, text=rsp)
-            data_desc[file_name] = {}
-            data_desc[file_name]["path"] = file
-            data_desc[file_name]["description"] = rsp
-            data_desc[file_name]["column_info"] = get_samples(df)
+        df = read_data(file)
+        data_head = df.head().to_dict(orient="list")
+        data_head = json.dumps(data_head, indent=4, ensure_ascii=False)
+        prompt = GEN_DATA_DESC_PROMPT.replace("{data_head}", data_head)
+        rsp = await self._aask(prompt)
+        rsp = CodeParser.parse_code(block=None, text=rsp)
+        rsp = json.loads(rsp)
+        data_desc["path"] = file
+        data_desc["data_desc"] = rsp["data_desc"]
+        data_desc["column_desc"] = rsp["column_desc"]
+        data_desc["column_info"] = get_column_info(df)
        return data_desc


@ -184,6 +167,8 @@ class MLEngineer(Role):
                self.plan.finish_current_task()
                self.working_memory.clear()

+                if "print(df_processed.info())" in code:
+                    self.data_desc["column_info"] = result
            else:
                # update plan according to user's feedback and to take on changed tasks
                await self._update_plan()
@ -198,8 +183,7 @@ class MLEngineer(Role):
            print(truncate(result))

    async def _generate_data_desc(self):
-        files = glob.glob(self.data_path + "/*.csv")
-        data_desc = await GenerateDataDesc().run(files=files)
+        data_desc = await GenerateDataDesc().run(self.data_path)
        return data_desc

    async def _write_and_exec_code(self, max_retry: int = 3):
@ -221,14 +205,10 @@ class MLEngineer(Role):
            if counter == 0:
                context = self.get_useful_memories()
            else:
-                # improve_code = await DebugCode().run(plan=self.plan,
-                #                                      code= code_context + "\n\n" + code,
-                #                                      runtime_result=self.working_memory.get())
-                improve_code = ""
+                improve_code = await DebugCode().run(plan=self.plan,
+                                                     code= code_context + "\n\n" + code,
+                                                     runtime_result=self.working_memory.get())

-            # breakpoint()
-
-            column_names_dict = {key: value["column_info"] for key, value in self.data_desc.items()}

            if not self.use_tools or self.plan.current_task.task_type == "other":
                logger.info("Write code with pure generation")
@ -246,7 +226,7 @@ class MLEngineer(Role):
                    cause_by = DebugCode
                else:
                    code = await WriteCodeWithTools().run(
-                        context=context, plan=self.plan, code_steps=code_steps, **{"column_names": column_names_dict}
+                        context=context, plan=self.plan, code_steps=code_steps, **{"column_names": {}}
                    )

                    cause_by = WriteCodeWithTools
@ -260,7 +240,7 @@ class MLEngineer(Role):
            result, success = await self.execute_code.run(runcode)
            # truncated the result
            print(truncate(result))
-            # print(result)
+            
            self.working_memory.add(
                Message(content=truncate(remove_escape_and_color_codes(result)), role="user", cause_by=ExecutePyCode)
            )
@ -330,9 +310,8 @@ if __name__ == "__main__":
    # requirement = "Run data analysis on sklearn Wisconsin Breast Cancer dataset, include a plot, train a model to predict targets (20% as validation), and show validation accuracy"
    # requirement = "Run EDA and visualization on this dataset, train a model to predict survival, report metrics on validation set (20%), dataset: workspace/titanic/train.csv"

-    from metagpt.const import DATA_PATH
-
    # requirement = "Perform data analysis on the provided data. Train a model to predict the target variable Survived. Include data preprocessing, feature engineering, and modeling in your pipeline. The metric is accuracy."
+
    data_path = f"{DATA_PATH}/titanic"
    requirement = f"This is a titanic passenger survival dataset, your goal is to predict passenger survival outcome. The target column is Survived. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report accuracy on the eval data. Train data path: '{data_path}/split_train.csv', eval data path: '{data_path}/split_eval.csv'."

--- a/metagpt/tools/functions/libs/data_preprocess.py
+++ b/metagpt/tools/functions/libs/data_preprocess.py
@ -1,15 +1,12 @@
-
-import pandas as pd
 import numpy as np
-
 from sklearn.impute import SimpleImputer
-from sklearn.preprocessing import LabelEncoder
 from sklearn.preprocessing import KBinsDiscretizer
-from sklearn.preprocessing import MinMaxScaler
-from sklearn.preprocessing import StandardScaler
 from sklearn.preprocessing import MaxAbsScaler
-from sklearn.preprocessing import RobustScaler
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.preprocessing import OneHotEncoder
 from sklearn.preprocessing import OrdinalEncoder
+from sklearn.preprocessing import RobustScaler
+from sklearn.preprocessing import StandardScaler

 from metagpt.tools.functions import registry
 from metagpt.tools.functions.schemas.data_preprocess import *
@ -21,13 +18,6 @@ def fill_missing_value(df: pd.DataFrame, features: list, strategy: str = 'mean',
    return df


-# @registry.register("data_preprocess", FillMissingValue)
-# def label_encode(df: pd.DataFrame, features: list,):
-#     for col in features:
-#         df[col] = LabelEncoder().fit_transform(df[col])
-#     return df
-
-
@registry.register("data_preprocess", SplitBins)
 def split_bins(df: pd.DataFrame, features: list, strategy: str = 'quantile',):
    df[features] = KBinsDiscretizer(strategy=strategy, encode='ordinal').fit_transform(df[features])
@ -73,6 +63,17 @@ def ordinal_encode(df: pd.DataFrame, features: list,):
    return df


+@registry.register("data_preprocess", OneHotEncoding)
+def one_hot_encoding(df, cols):
+    enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
+    ts_data = enc.fit_transform(df[cols])
+    new_columns = enc.get_feature_names_out(cols)
+    ts_data = pd.DataFrame(ts_data, columns=new_columns, index=df.index)
+    df.drop(cols, axis=1, inplace=True)
+    df = pd.concat([df, ts_data], axis=1)
+    return df
+
+
 if __name__ == '__main__':
    def run():
        V = {
--- a/metagpt/tools/functions/libs/feature_engineering.py
+++ b/metagpt/tools/functions/libs/feature_engineering.py
@ -8,7 +8,8 @@ import itertools

 from dateutil.relativedelta import relativedelta
 from pandas.api.types import is_numeric_dtype
-from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder
+from sklearn.model_selection import KFold
+from sklearn.preprocessing import PolynomialFeatures

 from metagpt.tools.functions import registry
 from metagpt.tools.functions.schemas.feature_engineering import *
@ -29,17 +30,6 @@ def polynomial_expansion(df, cols, degree=2):
    return df


-@registry.register("feature_engineering", OneHotEncoding)
-def one_hot_encoding(df, cols):
-    enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
-    ts_data = enc.fit_transform(df[cols])
-    new_columns = enc.get_feature_names_out(cols)
-    ts_data = pd.DataFrame(ts_data, columns=new_columns, index=df.index)
-    df.drop(cols, axis=1, inplace=True)
-    df = pd.concat([df, ts_data], axis=1)
-    return df
-
-
@registry.register("feature_engineering", FrequencyEncoding)
 def frequency_encoding(df, cols):
    for col in cols:
@ -48,6 +38,31 @@ def frequency_encoding(df, cols):
    return df


+@registry.register("feature_engineering", TargetMeanEncoder)
+def target_mean_encoder(df, col, label):
+    encoder_dict = df.groupby(col)[label].mean().to_dict()
+    df[f"{col}_target_mean"] = df[col].map(encoder_dict)
+    return df
+
+
+@registry.register("feature_engineering", KFoldTargetMeanEncoder)
+def k_fold_target_mean_encoder(df, col, label, n_splits=5, random_state=2021):
+    tmp = df.copy()
+    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
+
+    global_mean = tmp[label].mean()
+    col_name = f"{col}_kf_target_mean"
+    for trn_idx, val_idx in kf.split(tmp, tmp[label]):
+        _trn, _val = tmp.iloc[trn_idx], tmp.iloc[val_idx]
+        tmp.loc[tmp.index[val_idx], col_name] = _val[col].map(
+            _trn.groupby(col)[label].mean()
+        )
+    tmp[col_name].fillna(global_mean, inplace=True)
+    encoder_dict = tmp.groupby(col)[col_name].mean().to_dict()
+    df[f"{col}_kf_target_mean"] = df[col].map(encoder_dict)
+    return df
+
+
@registry.register("feature_engineering", CatCross)
 def cat_cross(df, cols, max_cat_num=100):
    for col in cols:
@ -56,7 +71,8 @@ def cat_cross(df, cols, max_cat_num=100):

    for col1, col2 in itertools.combinations(cols, 2):
        cross_col = f"{col1}_cross_{col2}"
-        df[cross_col] = df[col1].astype(str) + "_" + df[col2].astype(str)
+        crossed = df[col1].astype(str) + "_" + df[col2].astype(str)
+        df[cross_col] = crossed.astype('category').cat.codes
    return df


--- a/metagpt/tools/functions/schemas/data_preprocess.py
+++ b/metagpt/tools/functions/schemas/data_preprocess.py
@ -8,14 +8,13 @@ class FillMissingValue(ToolSchema):
    """Completing missing values with simple strategies"""
    df: pd.DataFrame = tool_field(description="input dataframe")
    features: list = tool_field(description="columns to be processed")
-    strategy: str = tool_field(description="the imputation strategy", default='mean')
-    fill_value: int = tool_field(description="fill_value is used to replace all occurrences of missing_values", default=None)
-
-
-# class LabelEncode(ToolSchema):
-#     """Completing missing values with simple strategies"""
-#     df: pd.DataFrame = tool_field(description="input dataframe")
-#     features: list = tool_field(description="columns to be processed")
+    strategy: str = tool_field(
+        description="the imputation strategy",
+        default='mean',
+        enum=['mean', 'median', 'most_frequent', 'constant']
+    )
+    fill_value: int = tool_field(
+        description="fill_value is used to replace all occurrences of missing_values", default=None)


 class SplitBins(ToolSchema):
@ -60,3 +59,9 @@ class OrdinalEncode(ToolSchema):
    df: pd.DataFrame = tool_field(description="input dataframe")
    features: list = tool_field(description="columns to be processed")

+
+class OneHotEncoding(ToolSchema):
+    """Apply one-hot encoding to specified categorical columns, the original columns will be dropped."""
+
+    df: pd.DataFrame = tool_field(description="DataFrame to process.")
+    cols: list = tool_field(description="Categorical columns to be one-hot encoded and dropped.")
--- a/metagpt/tools/functions/schemas/feature_engineering.py
+++ b/metagpt/tools/functions/schemas/feature_engineering.py
@ -12,29 +12,39 @@ from metagpt.tools.functions.schemas.base import ToolSchema, tool_field


 class PolynomialExpansion(ToolSchema):
-    """Generate polynomial and interaction features from selected columns, excluding the bias column."""
+    """Add polynomial and interaction features from selected numeric columns, excluding the bias column."""

    df: pd.DataFrame = tool_field(description="DataFrame to process.")
    cols: list = tool_field(description="Columns for polynomial expansion.")
    degree: int = tool_field(description="Degree of polynomial features.", default=2)


-class OneHotEncoding(ToolSchema):
-    """Apply one-hot encoding to specified categorical columns, the original columns will be dropped."""
-
-    df: pd.DataFrame = tool_field(description="DataFrame to process.")
-    cols: list = tool_field(description="Categorical columns to be one-hot encoded.")
-
-
 class FrequencyEncoding(ToolSchema):
-    """Convert categorical columns to frequency encoding."""
+    """Add value counts of categorical columns as new features."""

    df: pd.DataFrame = tool_field(description="DataFrame to process.")
    cols: list = tool_field(description="Categorical columns to be frequency encoded.")


+class TargetMeanEncoder(ToolSchema):
+    """Encodes a categorical column by the mean of the label column, and adds the result as a new feature."""
+
+    df: pd.DataFrame = tool_field(description="DataFrame to process.")
+    col: str = tool_field(description="Column to be mean encoded.")
+    label: str = tool_field(description="Predicted label column.")
+
+
+class KFoldTargetMeanEncoder(ToolSchema):
+    """Adds a new feature to the DataFrame by k-fold mean encoding of a categorical column using the label column."""
+    df: pd.DataFrame = tool_field(description="DataFrame to process.")
+    col: str = tool_field(description="Column to be k-fold mean encoded.")
+    label: str = tool_field(description="Predicted label column.")
+    n_splits: int = tool_field(description="Number of splits for K-fold.", default=5)
+    random_state: int = tool_field(description="Random seed.", default=2021)
+
+
 class CatCross(ToolSchema):
-    """Create pairwise crossed features from categorical columns, joining values with '_'."""
+    """Add pairwise crossed features and convert them to numerical features."""

    df: pd.DataFrame = tool_field(description="DataFrame to process.")
    cols: list = tool_field(description="Columns to be pairwise crossed.")
@ -44,7 +54,7 @@ class CatCross(ToolSchema):


 class GroupStat(ToolSchema):
-    """Perform aggregation operations on a specified column grouped by certain categories."""
+    """Aggregate specified column in a DataFrame grouped by another column, adding new features named '<agg_col>_<agg_func>_by_<group_col>'."""

    df: pd.DataFrame = tool_field(description="DataFrame to process.")
    group_col: str = tool_field(description="Column used for grouping.")
@ -56,7 +66,7 @@ class GroupStat(ToolSchema):


 class ExtractTimeComps(ToolSchema):
-    """Extract specific time components from a designated time column in a DataFrame."""
+    """Extract and add specific time components as new features from a designated time column."""

    df: pd.DataFrame = tool_field(description="DataFrame to process.")
    time_col: str = tool_field(
@ -69,7 +79,7 @@ class ExtractTimeComps(ToolSchema):


 class FeShiftByTime(ToolSchema):
-    """Shift column values in a DataFrame based on specified time intervals."""
+    """Shift column values based on specified time intervals and add the resulting new features to the DataFrame. New features are named in the format of '<group_col>_<shift_col>_lag_<period>_<freq>'."""

    df: pd.DataFrame = tool_field(description="DataFrame to process.")
    time_col: str = tool_field(description="Column for time-based shifting.")
--- a/metagpt/utils/common.py
+++ b/metagpt/utils/common.py
@ -315,3 +315,17 @@ def create_func_config(func_schema: dict) -> dict:
        "tools": tools,
        "tool_choice": tool_choice,
    }
+
+
+def remove_comments(code_str):
+    """Remove comments from code."""
+    pattern = r"(\".*?\"|\'.*?\')|(\#.*?$)"
+    def replace_func(match):
+        if match.group(2) is not None:
+            return ""
+        else:
+            return match.group(1)
+
+    clean_code = re.sub(pattern, replace_func, code_str, flags=re.MULTILINE)
+    clean_code = os.linesep.join([s.rstrip() for s in clean_code.splitlines() if s.strip()])
+    return clean_code