From 4304dd28cae93e3a2c597bf139bcd2d7783b3dad Mon Sep 17 00:00:00 2001
From: wubinhao <15754305168@163.com>
Date: Tue, 5 Dec 2023 17:57:56 +0800
Subject: [PATCH 1/5] update write task guide, add code plan

---
 metagpt/actions/write_task_guide.py | 82 +++++++++++++++++++++++++++++
 metagpt/roles/ml_engineer.py        | 21 ++++----
 metagpt/schema.py                   |  1 +
 3 files changed, 92 insertions(+), 12 deletions(-)
 create mode 100644 metagpt/actions/write_task_guide.py

diff --git a/metagpt/actions/write_task_guide.py b/metagpt/actions/write_task_guide.py
new file mode 100644
index 000000000..eff53feef
--- /dev/null
+++ b/metagpt/actions/write_task_guide.py
@@ -0,0 +1,82 @@
+
+import json
+from typing import Dict, List, Union
+
+from metagpt.actions import Action
+from metagpt.schema import Message, Task, Plan
+
+
+TASK_GUIDE_PROMPT_TEMPLATE = """
+# Context
+{context}
+
+##  Format example
+1.
+2.
+3.
+...
+
+-----
+Tasks are all code development tasks.
+You are a professional engineer, the main goal is to plan out concise solution steps for Current Task before coding.
+A planning process can reduce the difficulty and improve the quality of coding.
+You may be given some code plans for the tasks ahead, but you don't have to follow the existing plan when planning the current task.
+The output plan should following the subsequent principles:
+1.The plan is a rough checklist of steps outlining the entire program's structure.Try to keep the number of steps fewer than 5.
+2.The steps should be written concisely and at a high level, avoiding overly detailed implementation specifics.
+3.The execution of the plan happens sequentially, but the plan can incorporate conditional (if) and looping(loop) keywords for more complex structures.
+4.Output carefully referenced "Format example" in format.
+"""
+
+STRUCTURAL_CONTEXT = """
+## User Requirement
+{user_requirement}
+## Current Plan
+{tasks}
+## Current Task
+{current_task}
+"""
+
+
+class WriteTaskGuide(Action):
+
+    async def run(self, plan: Plan) -> str:
+        """Run of a task guide writing action, used in ml engineer
+
+        Args:
+            plan (Plan): task plan providing the goal, all tasks and the current task
+
+        Returns:
+            str: The response returned by `self._aask` for the task guide prompt.
+        """
+
+        context = self.get_context(plan)
+        task_guide_prompt = TASK_GUIDE_PROMPT_TEMPLATE.format(
+            context=context,
+        )
+        task_guide = await self._aask(task_guide_prompt)
+        return task_guide
+
+    def get_context(self, plan: Plan):
+        user_requirement = plan.goal
+        task_rename_map = {
+            'task_id': 'task_id',
+            'instruction': 'instruction',
+            'is_finished': 'is_finished',
+            # 'task_guide': 'code_plan'
+        }
+
+        def process_task(task):
+            task_dict = task.dict()
+            ptask = {task_rename_map[k]: task_dict[k] for k in task_dict if k in task_rename_map}
+            return ptask
+        tasks = json.dumps(
+            [process_task(task) for task in plan.tasks], indent=4, ensure_ascii=False
+        )
+        current_task = json.dumps(process_task(plan.current_task)) if plan.current_task else {}
+        context = STRUCTURAL_CONTEXT.format(
+            user_requirement=user_requirement, tasks=tasks, current_task=current_task
+        )
+        # print(context)
+        return context
+
diff --git a/metagpt/roles/ml_engineer.py b/metagpt/roles/ml_engineer.py
index 65583638e..d905b7bfd 100644
--- a/metagpt/roles/ml_engineer.py
+++ b/metagpt/roles/ml_engineer.py
@@ -12,6 +12,7 @@ from metagpt.logs import logger
 from metagpt.actions.write_plan import WritePlan
 from metagpt.actions.write_analysis_code import WriteCodeByGenerate, WriteCodeWithTools
 from metagpt.actions.execute_code import ExecutePyCode
+from metagpt.actions.write_task_guide import WriteTaskGuide
 
 STRUCTURAL_CONTEXT = """
 ## User Requirement
@@ -66,11 +67,6 @@ class AskReview(Action):
         return rsp, confirmed
 
 
-class WriteTaskGuide(Action):
-    async def run(self, task_instruction: str, data_desc: str = "") -> str:
-        return ""
-
-
 class MLEngineer(Role):
     def __init__(
         self, name="ABC", profile="MLEngineer", goal="", auto_run: bool = False
@@ -79,7 +75,7 @@ class MLEngineer(Role):
         self._set_react_mode(react_mode="plan_and_act")
         self.plan = Plan(goal=goal)
         self.use_tools = False
-        self.use_task_guide = False
+        self.use_task_guide = True
         self.execute_code = ExecutePyCode()
         self.auto_run = auto_run
 
@@ -92,7 +88,7 @@ class MLEngineer(Role):
             logger.info(f"ready to take on task {task}")
 
             # take on current task
-            code, result, success = await self._write_and_exec_code()
+            code, result, success, task_guide = await self._write_and_exec_code()
 
             # ask for acceptance, users can other refuse and change tasks in the plan
             task_result_confirmed = await self._ask_review()
@@ -101,6 +97,7 @@ class MLEngineer(Role):
                 # tick off this task and record progress
                 task.code = code
                 task.result = result
+                task.task_guide = task_guide
                 self.plan.finish_current_task()
                 self.working_memory.clear()
 
@@ -110,7 +107,7 @@ class MLEngineer(Role):
 
     async def _write_and_exec_code(self, max_retry: int = 3):
         task_guide = (
-            await WriteTaskGuide().run(self.plan.current_task.instruction)
+            await WriteTaskGuide().run(self.plan)
             if self.use_task_guide
             else ""
         )
@@ -156,7 +153,7 @@ class MLEngineer(Role):
 
             counter += 1
 
-        return code, result, success
+        return code, result, success, task_guide
 
     async def _ask_review(self):
         if not self.auto_run:
@@ -185,7 +182,7 @@ class MLEngineer(Role):
 
     def get_useful_memories(self) -> List[Message]:
         """find useful memories only to reduce context length and improve performance"""
-
+        # TODO: dataset description, code steps
         user_requirement = self.plan.goal
         tasks = json.dumps(
             [task.dict() for task in self.plan.tasks], indent=4, ensure_ascii=False
@@ -204,9 +201,9 @@ class MLEngineer(Role):
 
 
 if __name__ == "__main__":
-    requirement = "Run data analysis on sklearn Iris dataset, include a plot"
+    # requirement = "Run data analysis on sklearn Iris dataset, include a plot"
     # requirement = "Run data analysis on sklearn Diabetes dataset, include a plot"
-    # requirement = "Run data analysis on sklearn Wine recognition dataset, include a plot, and train a model to predict wine class (20% as validation), and show validation accuracy"
+    requirement = "Run data analysis on sklearn Wine recognition dataset, include a plot, and train a model to predict wine class (20% as validation), and show validation accuracy"
     # requirement = "Run data analysis on sklearn Wisconsin Breast Cancer dataset, include a plot, train a model to predict targets (20% as validation), and show validation accuracy"
     # requirement = "Run EDA and visualization on this dataset, train a model to predict survival, report metrics on validation set (20%), dataset: workspace/titanic/train.csv"
 
diff --git a/metagpt/schema.py b/metagpt/schema.py
index e39f54a0c..db6861280 100644
--- a/metagpt/schema.py
+++ b/metagpt/schema.py
@@ -81,6 +81,7 @@ class Task(BaseModel):
     code: str = ""
     result: str = ""
     is_finished: bool = False
+    task_guide: str = ""
 
 
 class Plan(BaseModel):

From 7436150849344945de0d7783538f9e7d7f44fb41 Mon Sep 17 00:00:00 2001
From: wubinhao <15754305168@163.com>
Date: Tue, 5 Dec 2023 18:02:02 +0800
Subject: [PATCH 2/5] include task_guide as code_plan in task guide context

---
 metagpt/actions/write_task_guide.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/metagpt/actions/write_task_guide.py b/metagpt/actions/write_task_guide.py
index eff53feef..75067d33c 100644
--- a/metagpt/actions/write_task_guide.py
+++ b/metagpt/actions/write_task_guide.py
@@ -63,7 +63,7 @@ class WriteTaskGuide(Action):
             'task_id': 'task_id',
             'instruction': 'instruction',
             'is_finished': 'is_finished',
-            # 'task_guide': 'code_plan'
+            'task_guide': 'code_plan'
         }
 
         def process_task(task):

From 2e7abe7d0342c13f782c878662f065a7a1b829eb Mon Sep 17 00:00:00 2001
From: wubinhao <15754305168@163.com>
Date: Wed, 6 Dec 2023 11:24:24 +0800
Subject: [PATCH 3/5] change task_guide to code_steps

---
 metagpt/actions/write_analysis_code.py        | 12 ++++----
 ...rite_task_guide.py => write_code_steps.py} | 21 +++++--------
 metagpt/llm.py                                |  2 +-
 metagpt/roles/ml_engineer.py                  | 30 +++++++++----------
 metagpt/schema.py                             |  2 +-
 5 files changed, 31 insertions(+), 36 deletions(-)
 rename metagpt/actions/{write_task_guide.py => write_code_steps.py} (80%)

diff --git a/metagpt/actions/write_analysis_code.py b/metagpt/actions/write_analysis_code.py
index db0df2f90..1127dc78b 100644
--- a/metagpt/actions/write_analysis_code.py
+++ b/metagpt/actions/write_analysis_code.py
@@ -85,7 +85,7 @@ class WriteCodeByGenerate(BaseWriteAnalysisCode):
         self,
         context: [List[Message]],
         plan: Plan = None,
-        task_guide: str = "",
+        code_steps: str = "",
         system_msg: str = None,
         **kwargs,
     ) -> str:
@@ -155,7 +155,7 @@ class WriteCodeWithTools(BaseWriteAnalysisCode):
         self,
         context: List[Message],
         plan: Plan = None,
-        task_guide: str = "",
+        code_steps: str = "",
         data_desc: str = "",
     ) -> str:
         task_type = plan.current_task.task_type
@@ -165,12 +165,12 @@ class WriteCodeWithTools(BaseWriteAnalysisCode):
             {k: tool[k] for k in ["name", "description"] if k in tool}
             for tool in available_tools
         ]
-        task_guide = "\n".join(
-            [f"Step {step.strip()}" for step in task_guide.split("\n")]
+        code_steps = "\n".join(
+            [f"Step {step.strip()}" for step in code_steps.split("\n")]
         )
 
         recommend_tools = await self._tool_recommendation(
-            task, task_guide, available_tools
+            task, code_steps, available_tools
         )
         recommend_tools, tool_catalog = self._parse_recommend_tools(task_type, recommend_tools)
         logger.info(f"Recommended tools for every steps: {recommend_tools}")
@@ -194,7 +194,7 @@ class WriteCodeWithTools(BaseWriteAnalysisCode):
             completed_code=completed_code,
             data_desc=data_desc,
             special_prompt=special_prompt,
-            code_steps=task_guide,
+            code_steps=code_steps,
             module_name=module_name,
             output_desc=output_desc,
             available_tools=recommend_tools,
diff --git a/metagpt/actions/write_task_guide.py b/metagpt/actions/write_code_steps.py
similarity index 80%
rename from metagpt/actions/write_task_guide.py
rename to metagpt/actions/write_code_steps.py
index 75067d33c..47ea0b1df 100644
--- a/metagpt/actions/write_task_guide.py
+++ b/metagpt/actions/write_code_steps.py
@@ -6,7 +6,7 @@ from metagpt.actions import Action
 from metagpt.schema import Message, Task, Plan
 
 
-TASK_GUIDE_PROMPT_TEMPLATE = """
+CODE_STEPS_PROMPT_TEMPLATE = """
 # Context
 {context}
 
@@ -38,7 +38,7 @@ STRUCTURAL_CONTEXT = """
 """
 
 
-class WriteTaskGuide(Action):
+class WriteCodeSteps(Action):
 
     async def run(self, plan: Plan) -> str:
         """Run of a task guide writing action, used in ml engineer
@@ -51,24 +51,19 @@ class WriteTaskGuide(Action):
         """
 
         context = self.get_context(plan)
-        task_guide_prompt = TASK_GUIDE_PROMPT_TEMPLATE.format(
+        code_steps_prompt = CODE_STEPS_PROMPT_TEMPLATE.format(
             context=context,
         )
-        task_guide = await self._aask(task_guide_prompt)
-        return task_guide
+        code_steps = await self._aask(code_steps_prompt)
+        return code_steps
 
     def get_context(self, plan: Plan):
         user_requirement = plan.goal
-        task_rename_map = {
-            'task_id': 'task_id',
-            'instruction': 'instruction',
-            'is_finished': 'is_finished',
-            'task_guide': 'code_plan'
-        }
+        select_task_keys = ['task_id', 'instruction', 'is_finished', 'code_steps']
 
         def process_task(task):
             task_dict = task.dict()
-            ptask = {task_rename_map[k]: task_dict[k] for k in task_dict if k in task_rename_map}
+            ptask = {k: task_dict[k] for k in task_dict if k in select_task_keys}
             return ptask
         tasks = json.dumps(
             [process_task(task) for task in plan.tasks], indent=4, ensure_ascii=False
@@ -77,6 +72,6 @@ class WriteTaskGuide(Action):
         context = STRUCTURAL_CONTEXT.format(
             user_requirement=user_requirement, tasks=tasks, current_task=current_task
         )
-        # print(context)
+        print(context)
         return context
 
diff --git a/metagpt/llm.py b/metagpt/llm.py
index 4edcd7a83..c8ddf9a26 100644
--- a/metagpt/llm.py
+++ b/metagpt/llm.py
@@ -11,7 +11,7 @@ from metagpt.config import CONFIG
 from metagpt.provider.anthropic_api import Claude2 as Claude
 from metagpt.provider.openai_api import OpenAIGPTAPI
 from metagpt.provider.zhipuai_api import ZhiPuAIGPTAPI
-from metagpt.provider.spark_api import SparkAPI
+# from metagpt.provider.spark_api import SparkAPI
 from metagpt.provider.human_provider import HumanProvider
 
 
diff --git a/metagpt/roles/ml_engineer.py b/metagpt/roles/ml_engineer.py
index d905b7bfd..e957d66c4 100644
--- a/metagpt/roles/ml_engineer.py
+++ b/metagpt/roles/ml_engineer.py
@@ -12,7 +12,7 @@ from metagpt.logs import logger
 from metagpt.actions.write_plan import WritePlan
 from metagpt.actions.write_analysis_code import WriteCodeByGenerate, WriteCodeWithTools
 from metagpt.actions.execute_code import ExecutePyCode
-from metagpt.actions.write_task_guide import WriteTaskGuide
+from metagpt.actions.write_code_steps import WriteCodeSteps
 
 STRUCTURAL_CONTEXT = """
 ## User Requirement
@@ -75,7 +75,7 @@ class MLEngineer(Role):
         self._set_react_mode(react_mode="plan_and_act")
         self.plan = Plan(goal=goal)
         self.use_tools = False
-        self.use_task_guide = True
+        self.use_code_steps = True
         self.execute_code = ExecutePyCode()
         self.auto_run = auto_run
 
@@ -88,7 +88,7 @@ class MLEngineer(Role):
             logger.info(f"ready to take on task {task}")
 
             # take on current task
-            code, result, success, task_guide = await self._write_and_exec_code()
+            code, result, success, code_steps = await self._write_and_exec_code()
 
             # ask for acceptance, users can other refuse and change tasks in the plan
             task_result_confirmed = await self._ask_review()
@@ -97,7 +97,7 @@ class MLEngineer(Role):
                 # tick off this task and record progress
                 task.code = code
                 task.result = result
-                task.task_guide = task_guide
+                task.code_steps = code_steps
                 self.plan.finish_current_task()
                 self.working_memory.clear()
 
@@ -106,9 +106,9 @@ class MLEngineer(Role):
                 await self._update_plan()
 
     async def _write_and_exec_code(self, max_retry: int = 3):
-        task_guide = (
-            await WriteTaskGuide().run(self.plan)
-            if self.use_task_guide
+        code_steps = (
+            await WriteCodeSteps().run(self.plan)
+            if self.use_code_steps
             else ""
         )
 
@@ -123,14 +123,14 @@ class MLEngineer(Role):
             # breakpoint()
 
             if not self.use_tools or self.plan.current_task.task_type == "other":
-                # code = "print('abc')"
-                code = await WriteCodeByGenerate().run(
-                    context=context, plan=self.plan, task_guide=task_guide, temperature=0.0
-                )
+                code = "print('abc')"
+                # code = await WriteCodeByGenerate().run(
+                #     context=context, plan=self.plan, code_steps=code_steps, temperature=0.0
+                # )
                 cause_by = WriteCodeByGenerate
             else:
                 code = await WriteCodeWithTools().run(
-                    context=context, plan=self.plan, task_guide=task_guide, data_desc=""
+                    context=context, plan=self.plan, code_steps=code_steps, data_desc=""
                 )
                 cause_by = WriteCodeWithTools
 
@@ -153,7 +153,7 @@ class MLEngineer(Role):
 
             counter += 1
 
-        return code, result, success, task_guide
+        return code, result, success, code_steps
 
     async def _ask_review(self):
         if not self.auto_run:
@@ -203,9 +203,9 @@ class MLEngineer(Role):
 if __name__ == "__main__":
     # requirement = "Run data analysis on sklearn Iris dataset, include a plot"
     # requirement = "Run data analysis on sklearn Diabetes dataset, include a plot"
-    requirement = "Run data analysis on sklearn Wine recognition dataset, include a plot, and train a model to predict wine class (20% as validation), and show validation accuracy"
+    # requirement = "Run data analysis on sklearn Wine recognition dataset, include a plot, and train a model to predict wine class (20% as validation), and show validation accuracy"
     # requirement = "Run data analysis on sklearn Wisconsin Breast Cancer dataset, include a plot, train a model to predict targets (20% as validation), and show validation accuracy"
-    # requirement = "Run EDA and visualization on this dataset, train a model to predict survival, report metrics on validation set (20%), dataset: workspace/titanic/train.csv"
+    requirement = "Run EDA and visualization on this dataset, train a model to predict survival, report metrics on validation set (20%), dataset: workspace/titanic/train.csv"
 
     async def main(requirement: str = requirement, auto_run: bool = False):
         role = MLEngineer(goal=requirement, auto_run=auto_run)
diff --git a/metagpt/schema.py b/metagpt/schema.py
index db6861280..2e4260096 100644
--- a/metagpt/schema.py
+++ b/metagpt/schema.py
@@ -81,7 +81,7 @@ class Task(BaseModel):
     code: str = ""
     result: str = ""
     is_finished: bool = False
-    task_guide: str = ""
+    code_steps: str = ""
 
 
 class Plan(BaseModel):

From 58e8e4c87936d6bf721f91109d4595a864a23203 Mon Sep 17 00:00:00 2001
From: wubinhao <15754305168@163.com>
Date: Wed, 6 Dec 2023 15:56:26 +0800
Subject: [PATCH 4/5] fix: re-enable SparkAPI import and code generation, silence debug print

---
 metagpt/actions/write_code_steps.py | 2 +-
 metagpt/llm.py                      | 2 +-
 metagpt/roles/ml_engineer.py        | 8 ++++----
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/metagpt/actions/write_code_steps.py b/metagpt/actions/write_code_steps.py
index 47ea0b1df..d3f6e5553 100644
--- a/metagpt/actions/write_code_steps.py
+++ b/metagpt/actions/write_code_steps.py
@@ -72,6 +72,6 @@ class WriteCodeSteps(Action):
         context = STRUCTURAL_CONTEXT.format(
             user_requirement=user_requirement, tasks=tasks, current_task=current_task
         )
-        print(context)
+        # print(context)
         return context
 
diff --git a/metagpt/llm.py b/metagpt/llm.py
index c8ddf9a26..4edcd7a83 100644
--- a/metagpt/llm.py
+++ b/metagpt/llm.py
@@ -11,7 +11,7 @@ from metagpt.config import CONFIG
 from metagpt.provider.anthropic_api import Claude2 as Claude
 from metagpt.provider.openai_api import OpenAIGPTAPI
 from metagpt.provider.zhipuai_api import ZhiPuAIGPTAPI
-# from metagpt.provider.spark_api import SparkAPI
+from metagpt.provider.spark_api import SparkAPI
 from metagpt.provider.human_provider import HumanProvider
 
 
diff --git a/metagpt/roles/ml_engineer.py b/metagpt/roles/ml_engineer.py
index e957d66c4..ce0689497 100644
--- a/metagpt/roles/ml_engineer.py
+++ b/metagpt/roles/ml_engineer.py
@@ -123,10 +123,10 @@ class MLEngineer(Role):
             # breakpoint()
 
             if not self.use_tools or self.plan.current_task.task_type == "other":
-                code = "print('abc')"
-                # code = await WriteCodeByGenerate().run(
-                #     context=context, plan=self.plan, code_steps=code_steps, temperature=0.0
-                # )
+                # code = "print('abc')"
+                code = await WriteCodeByGenerate().run(
+                    context=context, plan=self.plan, code_steps=code_steps, temperature=0.0
+                )
                 cause_by = WriteCodeByGenerate
             else:
                 code = await WriteCodeWithTools().run(

From 029adbc6d6fcc10e1cd553e2412b2355de36f2e8 Mon Sep 17 00:00:00 2001
From: wubinhao <15754305168@163.com>
Date: Wed, 6 Dec 2023 16:48:31 +0800
Subject: [PATCH 5/5] add data_preprocess and ml_model tool functions and schemas

---
 .../tools/functions/libs/data_preprocess.py   | 123 +++++++++++
 metagpt/tools/functions/libs/ml_model.py      | 196 ++++++++++++++++++
 .../functions/schemas/data_preprocess.py      |  62 ++++++
 metagpt/tools/functions/schemas/ml_model.py   |  55 +++++
 4 files changed, 436 insertions(+)
 create mode 100644 metagpt/tools/functions/libs/data_preprocess.py
 create mode 100644 metagpt/tools/functions/libs/ml_model.py
 create mode 100644 metagpt/tools/functions/schemas/data_preprocess.py
 create mode 100644 metagpt/tools/functions/schemas/ml_model.py

diff --git a/metagpt/tools/functions/libs/data_preprocess.py b/metagpt/tools/functions/libs/data_preprocess.py
new file mode 100644
index 000000000..68c96bbc9
--- /dev/null
+++ b/metagpt/tools/functions/libs/data_preprocess.py
@@ -0,0 +1,123 @@
+
+import pandas as pd
+import numpy as np
+
+from sklearn.impute import SimpleImputer
+from sklearn.preprocessing import LabelEncoder
+from sklearn.preprocessing import KBinsDiscretizer
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.preprocessing import StandardScaler
+from sklearn.preprocessing import MaxAbsScaler
+from sklearn.preprocessing import RobustScaler
+from sklearn.preprocessing import OrdinalEncoder
+
+from metagpt.tools.functions import registry
+from metagpt.tools.functions.schemas.data_preprocess import *
+
+
+@registry.register("data_preprocess", FillMissingValue)
+def fill_missing_value(df: pd.DataFrame, features: list, strategy: str = 'mean', fill_value=None,):
+    df[features] = SimpleImputer(strategy=strategy, fill_value=fill_value).fit_transform(df[features])
+    return df
+
+
+# @registry.register("data_preprocess", FillMissingValue)
+# def label_encode(df: pd.DataFrame, features: list,):
+#     for col in features:
+#         df[col] = LabelEncoder().fit_transform(df[col])
+#     return df
+
+
+@registry.register("data_preprocess", SplitBins)
+def split_bins(df: pd.DataFrame, features: list, strategy: str = 'quantile',):
+    df[features] = KBinsDiscretizer(strategy=strategy, encode='ordinal').fit_transform(df[features])
+    return df
+
+
+@registry.register("data_preprocess", MinMaxScale)
+def min_max_scale(df: pd.DataFrame, features: list, ):
+    df[features] = MinMaxScaler().fit_transform(df[features])
+    return df
+
+
+@registry.register("data_preprocess", StandardScale)
+def standard_scale(df: pd.DataFrame, features: list, ):
+    df[features] = StandardScaler().fit_transform(df[features])
+    return df
+
+
+@registry.register("data_preprocess", LogTransform)
+def log_transform(df: pd.DataFrame, features: list, ):
+    for col in features:
+        if df[col].min() <= 0:
+            df[col] = df[col] - df[col].min() + 2
+        df[col] = np.log(df[col])
+    return df
+
+
+@registry.register("data_preprocess", MaxAbsScale)
+def max_abs_scale(df: pd.DataFrame, features: list, ):
+    df[features] = MaxAbsScaler().fit_transform(df[features])
+    return df
+
+
+@registry.register("data_preprocess", RobustScale)
+def robust_scale(df: pd.DataFrame, features: list, ):
+    df[features] = RobustScaler().fit_transform(df[features])
+    return df
+
+
+@registry.register("data_preprocess", OrdinalEncode)
+def ordinal_encode(df: pd.DataFrame, features: list,):
+    df[features] = OrdinalEncoder().fit_transform(df[features])
+    return df
+
+
+if __name__ == '__main__':
+    def run():
+        V = {
+            'a': [-1, 2, 3, 6, 5, 4],
+            'b': [1.1, 2.2, 3.3, 6.6, 5.5, 4.4],
+            'c': ['aa', 'bb', 'cc', 'dd', 'ee', 'ff'],
+            'd': [1, None, 3, None, 5, 4],
+            'e': [1.1, np.NAN, 3.3, None, 5.5, 4.4],
+            'f': ['aa', np.NAN, 'cc', None, '', 'ff'],
+
+        }
+
+        df = pd.DataFrame(V)
+        print(df.dtypes)
+
+        numeric_features = ['a', 'b', 'd', 'e']
+        numeric_features_wo_miss = ['a', 'b', ]
+        categorial_features = ['c', 'f']
+
+        df_ = fill_missing_value(df.copy(), numeric_features)
+        print(df_)
+        df_ = fill_missing_value(df.copy(), categorial_features, strategy='constant', fill_value='hehe')
+        print(df_)
+
+        df_ = fill_missing_value(df.copy(), numeric_features, strategy='constant', fill_value=999)
+        print(df_)
+
+        # df_ = label_encode(df.copy(), numeric_features + categorial_features, )
+        # print(df_)
+
+        df_ = split_bins(df.copy(), numeric_features_wo_miss, strategy='quantile')
+        print(df_)
+
+        df_ = min_max_scale(df.copy(), numeric_features, )
+        print(df_)
+
+        df_ = standard_scale(df.copy(), numeric_features, )
+        print(df_)
+
+        df_ = log_transform(df.copy(), numeric_features, )
+        print(df_)
+
+        df_ = max_abs_scale(df.copy(), numeric_features, )
+        print(df_)
+
+        df_ = robust_scale(df.copy(), numeric_features, )
+        print(df_)
+    run()
\ No newline at end of file
diff --git a/metagpt/tools/functions/libs/ml_model.py b/metagpt/tools/functions/libs/ml_model.py
new file mode 100644
index 000000000..b669de2c1
--- /dev/null
+++ b/metagpt/tools/functions/libs/ml_model.py
@@ -0,0 +1,196 @@
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import LabelEncoder
+
+from sklearn.linear_model import LogisticRegression
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.ensemble import GradientBoostingClassifier
+
+
+from sklearn.linear_model import LinearRegression
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.ensemble import GradientBoostingRegressor
+
+from metagpt.tools.functions import registry
+from metagpt.tools.functions.schemas.ml_model import *
+
+
+####################
+## Classification ##
+####################
+
+
+@registry.register("classification_model", LogisticRegressionClassification)
+def logistic_regression_classification(df, label, test_size=0.2, penalty='l2', dual=False):
+    nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
+    for col in nonnumeric_columns:
+        df[col] = LabelEncoder().fit_transform(df[col])
+    df = df.fillna(0)
+
+    features = [col for col in df if col != label]
+    x, y = df[features], df[label]
+    tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1)
+
+    model = LogisticRegression(penalty=penalty, dual=dual)
+    model.fit(tr_x, tr_y, )
+    te_pred_prob = model.predict_proba(te_x)
+
+    res = {
+        'te_pred_prob': te_pred_prob
+    }
+    return res
+
+
+@registry.register("classification_model", RandomForestClassification)
+def random_forest_classification(df, label, test_size=0.2, n_estimators=100, criterion='gini'):
+    nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
+    for col in nonnumeric_columns:
+        df[col] = LabelEncoder().fit_transform(df[col])
+    df = df.fillna(0)
+
+    features = [col for col in df if col != label]
+    x, y = df[features], df[label]
+    tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1)
+    model = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion)
+    model.fit(tr_x, tr_y, )
+    te_pred_prob = model.predict_proba(te_x)
+
+    res = {
+        'te_pred_prob': te_pred_prob
+    }
+    return res
+
+
+@registry.register("classification_model", GradientBoostingClassification)
+def gradient_boosting_classification(df, label, test_size=0.2, n_estimators=100, learning_rate=0.1):
+    nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
+    for col in nonnumeric_columns:
+        df[col] = LabelEncoder().fit_transform(df[col])
+    df = df.fillna(0)
+
+    features = [col for col in df if col != label]
+    x, y = df[features], df[label]
+    tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1)
+    model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate)
+    model.fit(tr_x, tr_y, )
+    te_pred_prob = model.predict_proba(te_x)
+
+    res = {
+        'te_pred_prob': te_pred_prob
+    }
+    return res
+
+
+
+################
+## Regression ##
+################
+
+
+@registry.register("regression_model", LinearRegressionRegression)
+def linear_regression(df, label, test_size=0.2, ):
+    nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
+    for col in nonnumeric_columns:
+        df[col] = LabelEncoder().fit_transform(df[col])
+    df = df.fillna(0)
+
+    features = [col for col in df if col != label]
+    x, y = df[features], df[label]
+    tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1)
+
+    model = LinearRegression()
+    model.fit(tr_x, tr_y, )
+    te_pred_prob = model.predict(te_x)
+
+    res = {
+        'te_pred_prob': te_pred_prob
+    }
+    return res
+
+
+@registry.register("regression_model", RandomForestRegression)
+def random_forest_regression(df, label, test_size=0.2, n_estimators=100, criterion='squared_error'):
+    nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
+    for col in nonnumeric_columns:
+        df[col] = LabelEncoder().fit_transform(df[col])
+    df = df.fillna(0)
+
+    features = [col for col in df if col != label]
+    x, y = df[features], df[label]
+    tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1)
+    model = RandomForestRegressor(n_estimators=n_estimators, criterion=criterion)
+    model.fit(tr_x, tr_y, )
+    te_pred_prob = model.predict(te_x)
+
+    res = {
+        'te_pred_prob': te_pred_prob
+    }
+    return res
+
+
+@registry.register("regression_model", GradientBoostingRegression)
+def gradient_boosting_regression(df, label, test_size=0.2, n_estimators=100, learning_rate=0.1):
+    nonnumeric_columns = [col for col in df if df[col].dtype == 'object']
+    for col in nonnumeric_columns:
+        df[col] = LabelEncoder().fit_transform(df[col])
+    df = df.fillna(0)
+
+    features = [col for col in df if col != label]
+    x, y = df[features], df[label]
+    tr_x, te_x, tr_y, te_y = train_test_split(x, y, test_size=test_size, random_state=1)
+    model = GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=learning_rate)
+    model.fit(tr_x, tr_y, )
+    te_pred_prob = model.predict(te_x)
+
+    res = {
+        'te_pred_prob': te_pred_prob
+    }
+    return res
+
+
+if __name__ == '__main__':
+    def run():
+        from sklearn.datasets import load_iris
+        loader = load_iris(as_frame=True)
+        df = loader['data']
+        df['target'] = loader['target']
+
+        df[df.columns[0]] = df[df.columns[0]].astype(str)
+        df[df.columns[1]] = df[df.columns[1]].astype(int)
+        df['target'] = df['target'].astype(str)
+
+        print(df)
+        print('####'*5)
+        res = logistic_regression_classification(df, 'target', test_size=0.25, penalty='l2', dual=False)
+        print(res['te_pred_prob'])
+
+        print('####'*5)
+        res = random_forest_classification(df, 'target', test_size=0.25, n_estimators=100, criterion='gini')
+        print(res['te_pred_prob'])
+
+        print('####'*5)
+        res = gradient_boosting_classification(df, 'target', test_size=0.25, n_estimators=100, learning_rate=0.1)
+        print(res['te_pred_prob'])
+
+        from sklearn.datasets import make_regression
+        import pandas as pd
+        loader = make_regression()
+        df = pd.DataFrame(loader[0])
+        df['target'] = loader[1]
+
+        df[df.columns[0]] = df[df.columns[0]].astype(str)
+        df[df.columns[1]] = df[df.columns[1]].astype(int)
+        # df['target'] = df['target'].astype(str)
+
+        print(df)
+        print('####' * 5)
+        res = linear_regression(df, 'target', test_size=0.25, )
+        print(res['te_pred_prob'])
+
+        print('####' * 5)
+        res = random_forest_regression(df, 'target', test_size=0.25, n_estimators=100, criterion='squared_error')
+        print(res['te_pred_prob'])
+
+        print('####' * 5)
+        res = gradient_boosting_regression(df, 'target', test_size=0.25, n_estimators=100, learning_rate=0.1)
+        print(res['te_pred_prob'])
+    run()
\ No newline at end of file
diff --git a/metagpt/tools/functions/schemas/data_preprocess.py b/metagpt/tools/functions/schemas/data_preprocess.py
new file mode 100644
index 000000000..40e1d64e0
--- /dev/null
+++ b/metagpt/tools/functions/schemas/data_preprocess.py
@@ -0,0 +1,62 @@
+
+import pandas as pd
+
+from metagpt.tools.functions.schemas.base import tool_field, ToolSchema
+
+
+class FillMissingValue(ToolSchema):
+    """Completing missing values with simple strategies"""
+    df: pd.DataFrame = tool_field(description="input dataframe")
+    features: list = tool_field(description="columns to be processed")
+    strategy: str = tool_field(description="the imputation strategy", default='mean')  # NOTE(review): presumably one of sklearn SimpleImputer's strategies ('mean', 'median', 'most_frequent', 'constant') - confirm against the implementation
+    fill_value: int = tool_field(description="fill_value is used to replace all occurrences of missing_values", default=None)  # annotated int but the default is None; NOTE(review): presumably only consulted for a constant-fill strategy - confirm
+
+
+# class LabelEncode(ToolSchema):
+#     """Encode the values of the given columns as integer labels"""
+#     df: pd.DataFrame = tool_field(description="input dataframe")
+#     features: list = tool_field(description="columns to be processed")
+
+
+class SplitBins(ToolSchema):
+    """Bin continuous data into intervals and return the bin identifier encoded as an integer value"""
+    df: pd.DataFrame = tool_field(description="input dataframe")
+    features: list = tool_field(description="columns to be processed")
+    strategy: str = tool_field(description="Strategy used to define the widths of the bins", default='quantile')  # NOTE(review): presumably a sklearn KBinsDiscretizer strategy ('uniform', 'quantile', 'kmeans') - confirm; the number of bins is not a field here
+
+
+class MinMaxScale(ToolSchema):
+    """Transform features by scaling each feature to a range, which is (0, 1)"""
+    df: pd.DataFrame = tool_field(description="input dataframe")
+    features: list = tool_field(description="columns to be processed")  # the (0, 1) range named in the docstring is not declared as a field here
+
+
+class StandardScale(ToolSchema):
+    """Standardize features by removing the mean and scaling to unit variance"""
+    df: pd.DataFrame = tool_field(description="input dataframe")
+    features: list = tool_field(description="columns to be processed")  # bare list: element type is not constrained; presumably column names of df - TODO confirm
+
+
+class LogTransform(ToolSchema):
+    """Performs a logarithmic transformation on the specified columns"""
+    df: pd.DataFrame = tool_field(description="input dataframe")
+    features: list = tool_field(description="columns to be processed")  # NOTE(review): log base and treatment of zero/negative values are not described by this schema - check the implementation
+
+
+class MaxAbsScale(ToolSchema):
+    """Scale each feature by its maximum absolute value"""
+    df: pd.DataFrame = tool_field(description="input dataframe")
+    features: list = tool_field(description="columns to be processed")  # only df and the target columns are declared; no other scaler options appear in this schema
+
+
+class RobustScale(ToolSchema):
+    """Scale features using statistics that are robust to outliers, the quantile_range is (25.0, 75.0)"""
+    df: pd.DataFrame = tool_field(description="input dataframe")
+    features: list = tool_field(description="columns to be processed")  # the (25.0, 75.0) quantile_range named in the docstring is not declared as a field here
+
+
+class OrdinalEncode(ToolSchema):
+    """Encode categorical features as an integer array"""
+    df: pd.DataFrame = tool_field(description="input dataframe")
+    features: list = tool_field(description="columns to be processed")  # presumably names of the categorical columns in df - TODO confirm
+
diff --git a/metagpt/tools/functions/schemas/ml_model.py b/metagpt/tools/functions/schemas/ml_model.py
new file mode 100644
index 000000000..9268156af
--- /dev/null
+++ b/metagpt/tools/functions/schemas/ml_model.py
@@ -0,0 +1,55 @@
+import pandas as pd
+
+from metagpt.tools.functions.schemas.base import tool_field, ToolSchema
+
+
+class LogisticRegressionClassification(ToolSchema):
+    """Logistic Regression (aka logit, MaxEnt) classifier"""
+    df: pd.DataFrame = tool_field(description="input dataframe")
+    label: str = tool_field(description="target name")
+    test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)
+    penalty: str = tool_field(description="Specify the norm of the penalty", default="l2")
+    dual: bool = tool_field(description="Dual (constrained) or primal (regularized) formulation", default=False)  # boolean flag; False (primal) matches scikit-learn's LogisticRegression default
+
+
+class RandomForestClassification(ToolSchema):
+    """random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting"""
+    df: pd.DataFrame = tool_field(description="input dataframe")
+    label: str = tool_field(description="target name")
+    test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)  # default holds out 20% of the rows as the test set
+    n_estimators: int = tool_field(description="The number of trees in the forest", default=100)
+    criterion: str = tool_field(description="The function to measure the quality of a split", default="gini")  # NOTE(review): presumably forwarded to sklearn RandomForestClassifier(criterion=...) - confirm
+
+
+class GradientBoostingClassification(ToolSchema):
+    """Gradient Boosting for classification.This algorithm builds an additive model in a forward stage-wise fashion"""
+    df: pd.DataFrame = tool_field(description="input dataframe")
+    label: str = tool_field(description="target name")
+    test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)  # default holds out 20% of the rows as the test set
+    n_estimators: int = tool_field(description="The number of boosting stages to perform", default=100)  # NOTE(review): n_estimators/learning_rate presumably map to the same-named sklearn GradientBoostingClassifier parameters - confirm
+    learning_rate: float = tool_field(description="Learning rate shrinks the contribution of each tree by learning_rate", default=0.1)
+
+
+class LinearRegressionRegression(ToolSchema):  # NOTE(review): name repeats "Regression"; presumably describes the linear_regression function - confirm usages before renaming
+    """Ordinary least squares Linear Regression."""
+    df: pd.DataFrame = tool_field(description="input dataframe")
+    label: str = tool_field(description="target name")
+    test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)  # default holds out 20% of the rows as the test set
+
+
+class RandomForestRegression(ToolSchema):
+    """random forest is a meta estimator that fits a number of decision tree on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting"""
+    df: pd.DataFrame = tool_field(description="input dataframe")
+    label: str = tool_field(description="target name")
+    test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)  # default holds out 20% of the rows as the test set
+    n_estimators: int = tool_field(description="The number of trees in the forest", default=100)
+    criterion: str = tool_field(description="The function to measure the quality of a split", default="squared_error")  # NOTE(review): presumably forwarded to sklearn RandomForestRegressor; "squared_error" is the scikit-learn >= 1.0 spelling ("mse" before) - confirm the pinned version
+
+
+class GradientBoostingRegression(ToolSchema):
+    """Gradient Boosting for regression.This estimator builds an additive model in a forward stage-wise fashion"""
+    df: pd.DataFrame = tool_field(description="input dataframe")
+    label: str = tool_field(description="target name")
+    test_size: float = tool_field(description="The proportion of the test set to all the data", default=0.2)  # default holds out 20% of the rows as the test set
+    n_estimators: int = tool_field(description="The number of boosting stages to perform", default=100)  # NOTE(review): n_estimators/learning_rate presumably map to the same-named sklearn GradientBoostingRegressor parameters - confirm
+    learning_rate: float = tool_field(description="Learning rate shrinks the contribution of each tree by learning_rate", default=0.1)