From 7e343a100b449a8441ab55063ad76661d0391f46 Mon Sep 17 00:00:00 2001 From: lidanyang Date: Thu, 7 Dec 2023 20:45:08 +0800 Subject: [PATCH 1/2] update ml functions --- .../tools/functions/libs/data_preprocess.py | 29 ++++++------- .../functions/libs/feature_engineering.py | 42 +++++++++++++------ .../functions/schemas/data_preprocess.py | 21 ++++++---- .../functions/schemas/feature_engineering.py | 36 ++++++++++------ 4 files changed, 80 insertions(+), 48 deletions(-) diff --git a/metagpt/tools/functions/libs/data_preprocess.py b/metagpt/tools/functions/libs/data_preprocess.py index 68c96bbc9..5579c5bd8 100644 --- a/metagpt/tools/functions/libs/data_preprocess.py +++ b/metagpt/tools/functions/libs/data_preprocess.py @@ -1,15 +1,12 @@ - -import pandas as pd import numpy as np - from sklearn.impute import SimpleImputer -from sklearn.preprocessing import LabelEncoder from sklearn.preprocessing import KBinsDiscretizer -from sklearn.preprocessing import MinMaxScaler -from sklearn.preprocessing import StandardScaler from sklearn.preprocessing import MaxAbsScaler -from sklearn.preprocessing import RobustScaler +from sklearn.preprocessing import MinMaxScaler +from sklearn.preprocessing import OneHotEncoder from sklearn.preprocessing import OrdinalEncoder +from sklearn.preprocessing import RobustScaler +from sklearn.preprocessing import StandardScaler from metagpt.tools.functions import registry from metagpt.tools.functions.schemas.data_preprocess import * @@ -21,13 +18,6 @@ def fill_missing_value(df: pd.DataFrame, features: list, strategy: str = 'mean', return df -# @registry.register("data_preprocess", FillMissingValue) -# def label_encode(df: pd.DataFrame, features: list,): -# for col in features: -# df[col] = LabelEncoder().fit_transform(df[col]) -# return df - - @registry.register("data_preprocess", SplitBins) def split_bins(df: pd.DataFrame, features: list, strategy: str = 'quantile',): df[features] = KBinsDiscretizer(strategy=strategy, 
encode='ordinal').fit_transform(df[features]) @@ -73,6 +63,17 @@ def ordinal_encode(df: pd.DataFrame, features: list,): return df +@registry.register("data_preprocess", OneHotEncoding) +def one_hot_encoding(df, cols): + enc = OneHotEncoder(handle_unknown="ignore", sparse=False) + ts_data = enc.fit_transform(df[cols]) + new_columns = enc.get_feature_names_out(cols) + ts_data = pd.DataFrame(ts_data, columns=new_columns, index=df.index) + df.drop(cols, axis=1, inplace=True) + df = pd.concat([df, ts_data], axis=1) + return df + + if __name__ == '__main__': def run(): V = { diff --git a/metagpt/tools/functions/libs/feature_engineering.py b/metagpt/tools/functions/libs/feature_engineering.py index 0573f362d..4780e4fa0 100644 --- a/metagpt/tools/functions/libs/feature_engineering.py +++ b/metagpt/tools/functions/libs/feature_engineering.py @@ -8,7 +8,8 @@ import itertools from dateutil.relativedelta import relativedelta from pandas.api.types import is_numeric_dtype -from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder +from sklearn.model_selection import KFold +from sklearn.preprocessing import PolynomialFeatures from metagpt.tools.functions import registry from metagpt.tools.functions.schemas.feature_engineering import * @@ -29,17 +30,6 @@ def polynomial_expansion(df, cols, degree=2): return df -@registry.register("feature_engineering", OneHotEncoding) -def one_hot_encoding(df, cols): - enc = OneHotEncoder(handle_unknown="ignore", sparse=False) - ts_data = enc.fit_transform(df[cols]) - new_columns = enc.get_feature_names_out(cols) - ts_data = pd.DataFrame(ts_data, columns=new_columns, index=df.index) - df.drop(cols, axis=1, inplace=True) - df = pd.concat([df, ts_data], axis=1) - return df - - @registry.register("feature_engineering", FrequencyEncoding) def frequency_encoding(df, cols): for col in cols: @@ -48,6 +38,31 @@ def frequency_encoding(df, cols): return df +@registry.register("feature_engineering", TargetMeanEncoder) +def target_mean_encoder(df, 
col, label): + encoder_dict = df.groupby(col)[label].mean().to_dict() + df[f"{col}_target_mean"] = df[col].map(encoder_dict) + return df + + +@registry.register("feature_engineering", KFoldTargetMeanEncoder) +def k_fold_target_mean_encoder(df, col, label, n_splits=5, random_state=2021): + tmp = df.copy() + kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state) + + global_mean = tmp[label].mean() + col_name = f"{col}_kf_target_mean" + for trn_idx, val_idx in kf.split(tmp, tmp[label]): + _trn, _val = tmp.iloc[trn_idx], tmp.iloc[val_idx] + tmp.loc[tmp.index[val_idx], col_name] = _val[col].map( + _trn.groupby(col)[label].mean() + ) + tmp[col_name].fillna(global_mean, inplace=True) + encoder_dict = tmp.groupby(col)[col_name].mean().to_dict() + df[f"{col}_kf_target_mean"] = df[col].map(encoder_dict) + return df + + @registry.register("feature_engineering", CatCross) def cat_cross(df, cols, max_cat_num=100): for col in cols: @@ -56,7 +71,8 @@ def cat_cross(df, cols, max_cat_num=100): for col1, col2 in itertools.combinations(cols, 2): cross_col = f"{col1}_cross_{col2}" - df[cross_col] = df[col1].astype(str) + "_" + df[col2].astype(str) + crossed = df[col1].astype(str) + "_" + df[col2].astype(str) + df[cross_col] = crossed.astype('category').cat.codes return df diff --git a/metagpt/tools/functions/schemas/data_preprocess.py b/metagpt/tools/functions/schemas/data_preprocess.py index 40e1d64e0..16b97aeac 100644 --- a/metagpt/tools/functions/schemas/data_preprocess.py +++ b/metagpt/tools/functions/schemas/data_preprocess.py @@ -8,14 +8,13 @@ class FillMissingValue(ToolSchema): """Completing missing values with simple strategies""" df: pd.DataFrame = tool_field(description="input dataframe") features: list = tool_field(description="columns to be processed") - strategy: str = tool_field(description="the imputation strategy", default='mean') - fill_value: int = tool_field(description="fill_value is used to replace all occurrences of missing_values", default=None) 
- - -# class LabelEncode(ToolSchema): -# """Completing missing values with simple strategies""" -# df: pd.DataFrame = tool_field(description="input dataframe") -# features: list = tool_field(description="columns to be processed") + strategy: str = tool_field( + description="the imputation strategy", + default='mean', + enum=['mean', 'median', 'most_frequent', 'constant'] + ) + fill_value: int = tool_field( + description="fill_value is used to replace all occurrences of missing_values", default=None) class SplitBins(ToolSchema): @@ -60,3 +59,9 @@ class OrdinalEncode(ToolSchema): df: pd.DataFrame = tool_field(description="input dataframe") features: list = tool_field(description="columns to be processed") + +class OneHotEncoding(ToolSchema): + """Apply one-hot encoding to specified categorical columns, the original columns will be dropped.""" + + df: pd.DataFrame = tool_field(description="DataFrame to process.") + cols: list = tool_field(description="Categorical columns to be one-hot encoded and dropped.") diff --git a/metagpt/tools/functions/schemas/feature_engineering.py b/metagpt/tools/functions/schemas/feature_engineering.py index df2eebff6..5c89d9b16 100644 --- a/metagpt/tools/functions/schemas/feature_engineering.py +++ b/metagpt/tools/functions/schemas/feature_engineering.py @@ -12,29 +12,39 @@ from metagpt.tools.functions.schemas.base import ToolSchema, tool_field class PolynomialExpansion(ToolSchema): - """Generate polynomial and interaction features from selected columns, excluding the bias column.""" + """Add polynomial and interaction features from selected numeric columns, excluding the bias column.""" df: pd.DataFrame = tool_field(description="DataFrame to process.") cols: list = tool_field(description="Columns for polynomial expansion.") degree: int = tool_field(description="Degree of polynomial features.", default=2) -class OneHotEncoding(ToolSchema): - """Apply one-hot encoding to specified categorical columns, the original columns will be 
dropped.""" - - df: pd.DataFrame = tool_field(description="DataFrame to process.") - cols: list = tool_field(description="Categorical columns to be one-hot encoded.") - - class FrequencyEncoding(ToolSchema): - """Convert categorical columns to frequency encoding.""" + """Add value counts of categorical columns as new features.""" df: pd.DataFrame = tool_field(description="DataFrame to process.") cols: list = tool_field(description="Categorical columns to be frequency encoded.") +class TargetMeanEncoder(ToolSchema): + """Encodes a categorical column by the mean of the label column, and adds the result as a new feature.""" + + df: pd.DataFrame = tool_field(description="DataFrame to process.") + col: str = tool_field(description="Column to be mean encoded.") + label: str = tool_field(description="Predicted label column.") + + +class KFoldTargetMeanEncoder(ToolSchema): + """Adds a new feature to the DataFrame by k-fold mean encoding of a categorical column using the label column.""" + df: pd.DataFrame = tool_field(description="DataFrame to process.") + col: str = tool_field(description="Column to be k-fold mean encoded.") + label: str = tool_field(description="Predicted label column.") + n_splits: int = tool_field(description="Number of splits for K-fold.", default=5) + random_state: int = tool_field(description="Random seed.", default=2021) + + class CatCross(ToolSchema): - """Create pairwise crossed features from categorical columns, joining values with '_'.""" + """Add pairwise crossed features and convert them to numerical features.""" df: pd.DataFrame = tool_field(description="DataFrame to process.") cols: list = tool_field(description="Columns to be pairwise crossed.") @@ -44,7 +54,7 @@ class CatCross(ToolSchema): class GroupStat(ToolSchema): - """Perform aggregation operations on a specified column grouped by certain categories.""" + """Aggregate specified column in a DataFrame grouped by another column, adding new features named '__by_'.""" df: pd.DataFrame = 
tool_field(description="DataFrame to process.") group_col: str = tool_field(description="Column used for grouping.") @@ -56,7 +66,7 @@ class GroupStat(ToolSchema): class ExtractTimeComps(ToolSchema): - """Extract specific time components from a designated time column in a DataFrame.""" + """Extract and add specific time components as new features from a designated time column.""" df: pd.DataFrame = tool_field(description="DataFrame to process.") time_col: str = tool_field( @@ -69,7 +79,7 @@ class ExtractTimeComps(ToolSchema): class FeShiftByTime(ToolSchema): - """Shift column values in a DataFrame based on specified time intervals.""" + """Shift column values based on specified time intervals and add the resulting new features to the DataFrame. New features are named in the format of '__lag__'.""" df: pd.DataFrame = tool_field(description="DataFrame to process.") time_col: str = tool_field(description="Column for time-based shifting.") From fe2b79fedc407afe72ad855ea6187afe11108beb Mon Sep 17 00:00:00 2001 From: lidanyang Date: Thu, 7 Dec 2023 20:48:00 +0800 Subject: [PATCH 2/2] refine ml prompt --- metagpt/actions/write_analysis_code.py | 114 +++++++++--------------- metagpt/prompts/ml_engineer.py | 118 ++++++++++++++++++++++--- metagpt/roles/ml_engineer.py | 75 ++++++---------- metagpt/utils/common.py | 14 +++ 4 files changed, 192 insertions(+), 129 deletions(-) diff --git a/metagpt/actions/write_analysis_code.py b/metagpt/actions/write_analysis_code.py index 957d35f7e..f96ade1b4 100644 --- a/metagpt/actions/write_analysis_code.py +++ b/metagpt/actions/write_analysis_code.py @@ -15,15 +15,11 @@ from metagpt.prompts.ml_engineer import ( TOO_ORGANIZATION_PROMPT, ML_SPECIFIC_PROMPT, ML_MODULE_MAP, - TOOL_OUTPUT_DESC, - TOOL_USAGE_PROMPT, + TOOL_OUTPUT_DESC, DATA_PROCESS_PROMPT, ) from metagpt.schema import Message, Plan from metagpt.tools.functions import registry -from metagpt.utils.common import create_func_config -from metagpt.prompts.ml_engineer import 
GEN_DATA_DESC_PROMPT, GENERATE_CODE_PROMPT -from metagpt.utils.common import CodeParser -from metagpt.actions.execute_code import ExecutePyCode +from metagpt.utils.common import create_func_config, remove_comments class BaseWriteAnalysisCode(Action): @@ -51,13 +47,13 @@ class BaseWriteAnalysisCode(Action): # 添加默认的提示词 if ( - default_system_msg not in messages[0]["content"] - and messages[0]["role"] != "system" + default_system_msg not in messages[0]["content"] + and messages[0]["role"] != "system" ): messages.insert(0, {"role": "system", "content": default_system_msg}) elif ( - default_system_msg not in messages[0]["content"] - and messages[0]["role"] == "system" + default_system_msg not in messages[0]["content"] + and messages[0]["role"] == "system" ): messages[0] = { "role": "system", @@ -66,7 +62,7 @@ class BaseWriteAnalysisCode(Action): return messages async def run( - self, context: List[Message], plan: Plan = None, code_steps: str = "" + self, context: List[Message], plan: Plan = None, code_steps: str = "" ) -> str: """Run of a code writing action, used in data analysis or modeling @@ -87,12 +83,12 @@ class WriteCodeByGenerate(BaseWriteAnalysisCode): super().__init__(name, context, llm) async def run( - self, - context: [List[Message]], - plan: Plan = None, - code_steps: str = "", - system_msg: str = None, - **kwargs, + self, + context: [List[Message]], + plan: Plan = None, + code_steps: str = "", + system_msg: str = None, + **kwargs, ) -> str: context.append(Message(content=self.REUSE_CODE_INSTRUCTION, role="user")) prompt = self.process_msg(context, system_msg) @@ -102,7 +98,6 @@ class WriteCodeByGenerate(BaseWriteAnalysisCode): class WriteCodeWithTools(BaseWriteAnalysisCode): """Write code with help of local available tools. 
Choose tools first, then generate code to use the tools""" - execute_code = ExecutePyCode() @staticmethod def _parse_recommend_tools(module: str, recommend_tools: list) -> List[Dict]: @@ -126,10 +121,10 @@ class WriteCodeWithTools(BaseWriteAnalysisCode): return tool_catalog async def _tool_recommendation( - self, - context: [List[Message]], - code_steps: str, - available_tools: list + self, + task: str, + code_steps: str, + available_tools: list ) -> list: """ Recommend tools for the specified task. @@ -142,86 +137,63 @@ class WriteCodeWithTools(BaseWriteAnalysisCode): Returns: list: recommended tools for the specified task """ - system_prompt = TOOL_RECOMMENDATION_PROMPT.format( + prompt = TOOL_RECOMMENDATION_PROMPT.format( + current_task=task, code_steps=code_steps, available_tools=available_tools, ) - prompt = self.process_msg(context, system_prompt) - tool_config = create_func_config(SELECT_FUNCTION_TOOLS) rsp = await self.llm.aask_code(prompt, **tool_config) recommend_tools = rsp["recommend_tools"] return recommend_tools - async def run( - self, - context: List[Message], - plan: Plan = None, - code_steps: str = "", - **kwargs, + self, + context: List[Message], + plan: Plan = None, + code_steps: str = "", + column_info: str = "", ) -> str: task_type = plan.current_task.task_type - logger.info(f"task_type is: {task_type}") available_tools = registry.get_all_schema_by_module(task_type) - - # special_prompt = ML_SPECIFIC_PROMPT.get(task_type, "") + special_prompt = ML_SPECIFIC_PROMPT.get(task_type, "") finished_tasks = plan.get_finished_tasks() - code_context = [task.code for task in finished_tasks] - + code_context = [remove_comments(task.code) for task in finished_tasks] code_context = "\n\n".join(code_context) - ### add runtime info - result, success = await self.execute_code.run(code_context) - logger.info(result) - if len(available_tools) > 0: available_tools = [ {k: tool[k] for k in ["name", "description"] if k in tool} for tool in available_tools ] - 
final_code = code_context - - recommend_tools = await self._tool_recommendation(context, code_steps, available_tools) + recommend_tools = await self._tool_recommendation( + plan.current_task.instruction, + code_steps, + available_tools + ) tool_catalog = self._parse_recommend_tools(task_type, recommend_tools) logger.info(f"Recommended tools: \n{recommend_tools}") module_name = ML_MODULE_MAP[task_type] output_desc = TOOL_OUTPUT_DESC.get(task_type, "") - - hist_info = f"Previous finished code is \n\n ```Python {final_code} ``` \n\n " \ - f"Runtime result is {result} \n\n" - - prompt = TOOL_USAGE_PROMPT.format( - goal=plan.current_task.instruction, - context=hist_info, + prompt = DATA_PROCESS_PROMPT.format( + user_requirement=plan.goal, + history_code=code_context, + current_task=plan.current_task.instruction, + column_info=column_info, + special_prompt=special_prompt, code_steps=code_steps, module_name=module_name, output_desc=output_desc, function_catalog=tool_catalog, ) - - tool_config = create_func_config(CODE_GENERATOR_WITH_TOOLS) - - rsp = await self.llm.aask_code(prompt, **tool_config) - logger.info(f"rsp is: {rsp}") - final_code = final_code + "\n\n" + rsp["code"] - - return final_code - else: - hist_info = f"Previous finished code is \n\n ```Python {code_context} ``` \n\n " \ - f"runtime result is {result} \n\n" + context.append(Message(content=self.REUSE_CODE_INSTRUCTION, role="user")) + context.append(Message(content=special_prompt, role="user")) + prompt = self.process_msg(context) - prompt = GENERATE_CODE_PROMPT.format( - goal=plan.current_task.instruction, - context=hist_info, - ) - - tool_config = create_func_config(CODE_GENERATOR_WITH_TOOLS) - logger.info(f"prompt is: {prompt}") - rsp = await self.llm.aask_code(prompt, **tool_config) - logger.info(f"rsp is: {rsp}") - return rsp["code"] + tool_config = create_func_config(CODE_GENERATOR_WITH_TOOLS) + rsp = await self.llm.aask_code(prompt, **tool_config) + return rsp['code'] diff --git 
a/metagpt/prompts/ml_engineer.py b/metagpt/prompts/ml_engineer.py index b68dadc9a..88cebf68a 100644 --- a/metagpt/prompts/ml_engineer.py +++ b/metagpt/prompts/ml_engineer.py @@ -8,19 +8,22 @@ GEN_DATA_DESC_PROMPT = """ Here is the head 5 rows of the dataset: {data_head} -Please provide a brief one-sentence background of the dataset, and concise descriptions for each column. Keep descriptions short yet informative. +Please provide a brief one-sentence background of the dataset, and concise meaning for each column. Keep descriptions short. Output the information in a JSON format, as shown in this example: ```json { "data_desc": "Brief dataset background.", "column_desc": { - "column_name1": "Description of the first column.", - "column_name2": "Description of the second column.", + "column_name1": "Abstract meaning of the first column.", + "column_name2": "Abstract meaning of the second column.", ... } } ``` + +# Constraints: +- Don't contain specific values or examples found in the data column. """ ASSIGN_TASK_TYPE_PROMPT = """ @@ -53,19 +56,22 @@ ASSIGN_TASK_TYPE = { } TOOL_RECOMMENDATION_PROMPT = """ -Your are a tool recommender, the main goal is to recommend suitable tools for current task before coding. A tool means a function that can be used to help you solve the task. +## User Requirement: +{current_task} -## List of Available Tools: -{available_tools} - -This is a task guide for the current task, including detailed code steps. You can refer to it when recommending tools. +## Task +Recommend up to five tools from 'Available Tools' that can help solve the 'User Requirement'. +This is a detailed code steps for current task. You can refer to it when recommending tools. {code_steps} +## Available Tools: +{available_tools} + ## Tool Selection and Instructions: -- For the task, choose up to five tools that are most likely to be useful in solving the task. +- Select tools most relevant to completing the 'User Requirement'. 
- If you believe that no tools are suitable, indicate with an empty list. - Only list the names of the tools, not the full schema of each tool. -- The result should only contain tool names that are in the list of available tools. +- Ensure selected tools are listed in 'Available Tools'. """ SELECT_FUNCTION_TOOLS = { @@ -149,6 +155,34 @@ Finish your coding tasks as a helpful programmer based on the tools. """ +TOOL_USAGE_PROMPT = """ +## Target +{goal} + +## History Info +{context} + +## Available Tools: +Each function is described in JSON format, including the function name and parameters. {output_desc} +{function_catalog} + +When you call a function above, you should import the function from `{module_name}` first, e.g.: +```python +from metagpt.tools.functions.libs.data_preprocess import fill_missing_value +```end + +## Your Output Format: +Generate the complete code for this task: +```python +# Tools used: [function names or 'none'] + +```end + +## Attention: +Make sure use the columns from the dataset columns +Finish your coding tasks as a helpful programmer based on the tools. +""" + TOO_ORGANIZATION_PROMPT = """ The previous conversation has provided all tasks step-by-step for the use goal and their statuses. Now, begin writing code for the current task. This code should writen strictly on the basis of all previous completed tasks code, not a standalone code. And avoid writing duplicate code that has already been written in previous tasks, such as repeated import of packages, reading data, etc. @@ -197,6 +231,66 @@ The current task is about feature engineering. when performing it, please adhere - Before generating a new feature, ensure the used features are already processed and ready to use. """ +DATA_PROCESS_PROMPT = """ +# Background +As a data scientist, you need to help user to achieve the goal [{user_requirement}] step-by-step in an continuous Jupyter notebook. 
+
+## Done Tasks
+```python
+{history_code}
+```end
+
+## Current Task
+{current_task}
+
+# Latest Data Info
+Latest data info after previous tasks:
+{column_info}
+
+# Task
+Write a Python function for 'Current Task'. Start by copying the input DataFrame. Avoid duplicating code from 'Done Tasks'.
+Specifically, {special_prompt}
+
+# Code Steps:
+Follow the steps below when writing code if it's convenient.
+{code_steps}
+
+# Capabilities
+- You can utilize pre-defined tools in any code lines from 'Available Tools' in the form of python functions.
+- You can freely combine the use of any other public packages, like sklearn, numpy, pandas, etc.
+- You can do anything about data preprocessing, feature engineering, model training, etc.
+
+# Available Tools:
+Each function tool is described in JSON format. {output_desc}
+When you call a function below, import the function from `{module_name}` first.
+{function_catalog}
+
+# Output Example:
+When the current task is "fill missing value and handle outliers", the output code should be like:
+```python
+from metagpt.tools.functions.libs.data_preprocess import fill_missing_value
+
+def function_name(df):
+    df_processed = df.copy()
+    num_cols = df_processed.select_dtypes(include='number').columns.tolist()
+    df_processed = fill_missing_value(df_processed, num_cols, 'mean')
+
+    for col in num_cols:
+        low, high = df_processed[col].quantile([0.01, 0.99])
+        df_processed[col] = df_processed[col].clip(low, high)
+    return df_processed
+
+df_processed = function_name(df)
+print(df_processed.info())
+```end
+
+# Constraints:
+- Ensure the output new code is executable in the same Jupyter notebook where previous tasks' code has been executed.
+- Prioritize using pre-defined tools for the same functionality.
+- The returned DataFrame should always be named `df_processed`, while the input DataFrame should be based on the done tasks' output DataFrame.
+- Limit to one print statement for the output DataFrame's info.
+"""
+
 MODEL_TRAIN_PROMPT = """
 The current task is about training a model, please ensure high performance:
 - Keep in mind that your user prioritizes results and is highly focused on model performance. So, when needed, feel free to use models of any complexity to improve effectiveness, such as lightGBM, XGBoost, CatBoost, etc.
@@ -204,9 +298,9 @@ The current task is about training a model, please ensure high performance:
 - Use the data from previous task result directly, do not mock or reload data yourself.
 """
 
-DATA_PREPROCESS_OUTPUT_DESC = "Please note that all functions uniformly output a processed pandas.DataFrame, facilitating seamless integration into the broader workflow."
+DATA_PREPROCESS_OUTPUT_DESC = "Please note that all functions output an updated pandas.DataFrame after data preprocessing."
 
-FEATURE_ENGINEERING_OUTPUT_DESC = "Please note that all functions uniformly output updated pandas.DataFrame with feature engineering applied."
+FEATURE_ENGINEERING_OUTPUT_DESC = "Please note that all functions output an updated pandas.DataFrame with new features added or existing features modified."
CLASSIFICATION_MODEL_OUTPUT_DESC = "" diff --git a/metagpt/roles/ml_engineer.py b/metagpt/roles/ml_engineer.py index deb76f0a9..4ad24df52 100644 --- a/metagpt/roles/ml_engineer.py +++ b/metagpt/roles/ml_engineer.py @@ -1,21 +1,21 @@ -import glob import json +import re from typing import List import fire import pandas as pd -import re from metagpt.actions import Action from metagpt.actions.execute_code import ExecutePyCode from metagpt.actions.write_analysis_code import WriteCodeByGenerate, WriteCodeWithTools +from metagpt.actions.write_code_steps import WriteCodeSteps from metagpt.actions.write_plan import WritePlan +from metagpt.const import DATA_PATH from metagpt.logs import logger from metagpt.prompts.ml_engineer import GEN_DATA_DESC_PROMPT from metagpt.roles import Role from metagpt.schema import Message, Plan from metagpt.utils.common import CodeParser -from metagpt.actions.write_code_steps import WriteCodeSteps STRUCTURAL_CONTEXT = """ ## User Requirement @@ -70,32 +70,16 @@ def read_data(file: str) -> pd.DataFrame: return df -def get_samples(df: pd.DataFrame) -> str: +def get_column_info(df: pd.DataFrame) -> str: data = [] - - if len(df) > 5: - df_ = df.sample(5, random_state=0) - else: - df_ = df - - for i in list(df_): + for i in df.columns: nan_freq = float("%.2g" % (df[i].isna().mean() * 100)) n_unique = df[i].nunique() - s = df_[i].tolist() + data.append([i, df[i].dtype, nan_freq, n_unique]) - if str(df[i].dtype) == "float64": - s = [round(sample, 2) if not pd.isna(sample) else None for sample in s] - - data.append([df_[i].name, df[i].dtype, nan_freq, n_unique, s]) samples = pd.DataFrame( data, - columns=[ - "Column_name", - "Data_type", - "NaN_Frequency(%)", - "N_unique", - "Samples", - ], + columns=["Column_name", "Data_type", "NaN_Frequency(%)", "N_unique"], ) return samples.to_string(index=False) @@ -124,20 +108,19 @@ class AskReview(Action): class GenerateDataDesc(Action): - async def run(self, files: list) -> dict: + async def run(self, file: str) 
-> dict: data_desc = {} - for file in files: - df = read_data(file) - file_name = file.split("/")[-1] - data_head = df.head().to_dict(orient="list") - data_head = json.dumps(data_head, indent=4, ensure_ascii=False) - prompt = GEN_DATA_DESC_PROMPT.replace("{data_head}", data_head) - rsp = await self._aask(prompt) - rsp = CodeParser.parse_code(block=None, text=rsp) - data_desc[file_name] = {} - data_desc[file_name]["path"] = file - data_desc[file_name]["description"] = rsp - data_desc[file_name]["column_info"] = get_samples(df) + df = read_data(file) + data_head = df.head().to_dict(orient="list") + data_head = json.dumps(data_head, indent=4, ensure_ascii=False) + prompt = GEN_DATA_DESC_PROMPT.replace("{data_head}", data_head) + rsp = await self._aask(prompt) + rsp = CodeParser.parse_code(block=None, text=rsp) + rsp = json.loads(rsp) + data_desc["path"] = file + data_desc["data_desc"] = rsp["data_desc"] + data_desc["column_desc"] = rsp["column_desc"] + data_desc["column_info"] = get_column_info(df) return data_desc @@ -159,7 +142,6 @@ class MLEngineer(Role): if self.data_path: self.data_desc = await self._generate_data_desc() - # create initial plan and update until confirmation await self._update_plan() @@ -181,13 +163,14 @@ class MLEngineer(Role): self.plan.finish_current_task() self.working_memory.clear() + if "print(df_processed.info())" in code: + self.data_desc["column_info"] = result else: # update plan according to user's feedback and to take on changed tasks await self._update_plan() async def _generate_data_desc(self): - files = glob.glob(self.data_path + "/*.csv") - data_desc = await GenerateDataDesc().run(files=files) + data_desc = await GenerateDataDesc().run(self.data_path) return data_desc async def _write_and_exec_code(self, max_retry: int = 3): @@ -201,9 +184,11 @@ class MLEngineer(Role): success = False while not success and counter < max_retry: context = self.get_useful_memories() - # breakpoint() - column_names_dict = {key: value["column_info"] for 
key,value in self.data_desc.items()} + # print("*" * 10) + # print(context) + # print("*" * 10) + # breakpoint() if not self.use_tools or self.plan.current_task.task_type == "other": logger.info("Write code with pure generation") @@ -214,9 +199,9 @@ class MLEngineer(Role): cause_by = WriteCodeByGenerate else: logger.info("Write code with tools") - + column_info = self.data_desc['column_info'] code = await WriteCodeWithTools().run( - context=context, plan=self.plan, code_steps=code_steps, **{"column_names": column_names_dict} + context=context, plan=self.plan, code_steps=code_steps, column_info=column_info ) cause_by = WriteCodeWithTools @@ -296,10 +281,8 @@ if __name__ == "__main__": # requirement = "Run data analysis on sklearn Wisconsin Breast Cancer dataset, include a plot, train a model to predict targets (20% as validation), and show validation accuracy" # requirement = "Run EDA and visualization on this dataset, train a model to predict survival, report metrics on validation set (20%), dataset: workspace/titanic/train.csv" - from metagpt.const import DATA_PATH - requirement = "Perform data analysis on the provided data. Train a model to predict the target variable Survived. Include data preprocessing, feature engineering, and modeling in your pipeline. The metric is accuracy." 
- data_path = f"{DATA_PATH}/titanic" + data_path = f"{DATA_PATH}/titanic.csv" async def main(requirement: str = requirement, auto_run: bool = True, data_path: str = data_path): role = MLEngineer(goal=requirement, auto_run=auto_run, data_path=data_path) diff --git a/metagpt/utils/common.py b/metagpt/utils/common.py index 8f8edbc6d..168966ef7 100644 --- a/metagpt/utils/common.py +++ b/metagpt/utils/common.py @@ -315,3 +315,17 @@ def create_func_config(func_schema: dict) -> dict: "tools": tools, "tool_choice": tool_choice, } + + +def remove_comments(code_str): + """Remove comments from code.""" + pattern = r"(\".*?\"|\'.*?\')|(\#.*?$)" + def replace_func(match): + if match.group(2) is not None: + return "" + else: + return match.group(1) + + clean_code = re.sub(pattern, replace_func, code_str, flags=re.MULTILINE) + clean_code = os.linesep.join([s.rstrip() for s in clean_code.splitlines() if s.strip()]) + return clean_code