Merge pull request #994 from garylin2099/di_fixes

update tests, rm unused
2026-07-14 16:32:16 +02:00 · 2024-03-12 21:21:23 +08:00 · 2024-03-12 21:21:23 +08:00 · 4ef53a2eb3
commit 4ef53a2eb3
parent de14b4a57d 3b001572d9
11 changed files with 74 additions and 514 deletions
--- a/metagpt/actions/__init__.py
+++ b/metagpt/actions/__init__.py
@@ -23,7 +23,7 @@ from metagpt.actions.write_prd import WritePRD
 from metagpt.actions.write_prd_review import WritePRDReview
 from metagpt.actions.write_test import WriteTest
 from metagpt.actions.di.execute_nb_code import ExecuteNbCode
-from metagpt.actions.di.write_analysis_code import WriteCodeWithTools
+from metagpt.actions.di.write_analysis_code import WriteAnalysisCode
 from metagpt.actions.di.write_plan import WritePlan


@@ -46,7 +46,7 @@ class ActionType(Enum):
    WEB_BROWSE_AND_SUMMARIZE = WebBrowseAndSummarize
    CONDUCT_RESEARCH = ConductResearch
    EXECUTE_NB_CODE = ExecuteNbCode
-    WRITE_CODE_WITH_TOOLS = WriteCodeWithTools
+    WRITE_ANALYSIS_CODE = WriteAnalysisCode
    WRITE_PLAN = WritePlan


--- a/metagpt/actions/di/write_analysis_code.py
+++ b/metagpt/actions/di/write_analysis_code.py
@@ -21,9 +21,7 @@ from metagpt.schema import Message, Plan
 from metagpt.utils.common import CodeParser, process_message, remove_comments


-class WriteCodeWithTools(Action):
-    """Write code with help of local available tools. Choose tools first, then generate code to use the tools"""
-
+class WriteAnalysisCode(Action):
    async def _debug_with_reflection(self, context: list[Message], working_memory: list[Message]):
        reflection_prompt = REFLECTION_PROMPT.format(
            debug_example=DEBUG_REFLECTION_EXAMPLE,
@@ -67,7 +65,7 @@ class WriteCodeWithTools(Action):


 class CheckData(Action):
-    async def run(self, plan: Plan = None) -> dict:
+    async def run(self, plan: Plan) -> dict:
        finished_tasks = plan.get_finished_tasks()
        code_written = [remove_comments(task.code) for task in finished_tasks]
        code_written = "\n\n".join(code_written)
--- a/metagpt/actions/di/write_plan.py
+++ b/metagpt/actions/di/write_plan.py
@@ -41,7 +41,7 @@ class WritePlan(Action):
    ```
    """

-    async def run(self, context: list[Message], max_tasks: int = 5, use_tools: bool = False) -> str:
+    async def run(self, context: list[Message], max_tasks: int = 5) -> str:
        task_type_desc = "\n".join([f"- **{tt.type_name}**: {tt.value.desc}" for tt in TaskType])
        prompt = self.PROMPT_TEMPLATE.format(
            context="\n".join([str(ct) for ct in context]), max_tasks=max_tasks, task_type_desc=task_type_desc
@@ -51,14 +51,10 @@ class WritePlan(Action):
        return rsp


-def rsp_to_tasks(rsp: str) -> list[Task]:
+def update_plan_from_rsp(rsp: str, current_plan: Plan):
    rsp = json.loads(rsp)
    tasks = [Task(**task_config) for task_config in rsp]
-    return tasks

-
-def update_plan_from_rsp(rsp: str, current_plan: Plan):
-    tasks = rsp_to_tasks(rsp)
    if len(tasks) == 1 or tasks[0].dependent_task_ids:
        if tasks[0].dependent_task_ids and len(tasks) > 1:
            # tasks[0].dependent_task_ids means the generated tasks are not a complete plan
--- a/metagpt/roles/di/data_interpreter.py
+++ b/metagpt/roles/di/data_interpreter.py
@@ -7,7 +7,7 @@ from pydantic import Field, model_validator

 from metagpt.actions.di.ask_review import ReviewConst
 from metagpt.actions.di.execute_nb_code import ExecuteNbCode
-from metagpt.actions.di.write_analysis_code import CheckData, WriteCodeWithTools
+from metagpt.actions.di.write_analysis_code import CheckData, WriteAnalysisCode
 from metagpt.logs import logger
 from metagpt.prompts.di.write_analysis_code import DATA_INFO
 from metagpt.roles import Role
@@ -52,7 +52,7 @@ class DataInterpreter(Role):
        )  # create a flag for convenience, overwrite any passed-in value
        if self.tools:
            self.tool_recommender = BM25ToolRecommender(tools=self.tools)
-        self.set_actions([WriteCodeWithTools])
+        self.set_actions([WriteAnalysisCode])
        return self

    @property
@@ -82,11 +82,12 @@ class DataInterpreter(Role):
    async def _act(self) -> Message:
        """Useful in 'react' mode. Return a Message conforming to Role._act interface."""
        code, _, _ = await self._write_and_exec_code()
-        return Message(content=code, role="assistant", cause_by=WriteCodeWithTools)
+        return Message(content=code, role="assistant", cause_by=WriteAnalysisCode)

    async def _plan_and_act(self) -> Message:
-        await super()._plan_and_act()
+        rsp = await super()._plan_and_act()
        await self.execute_code.terminate()
+        return rsp

    async def _act_on_task(self, current_task: Task) -> TaskResult:
        """Useful in 'plan_and_act' mode. Wrap the output in a TaskResult for review and confirmation."""
@@ -143,7 +144,7 @@ class DataInterpreter(Role):
        plan_status="",
        tool_info="",
    ):
-        todo = WriteCodeWithTools()
+        todo = WriteAnalysisCode()
        logger.info(f"ready to {todo.name}")
        use_reflection = counter > 0 and self.use_reflection

--- a/tests/metagpt/actions/di/test_debug_code.py
+++ b/tests/metagpt/actions/di/test_debug_code.py
@@ -1,51 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Date    : 1/11/2024 8:51 PM
-# @Author  : stellahong (stellahong@fuzhi.ai)
-# @Desc    :
-
-import pytest
-
-from metagpt.actions.di.debug_code import DebugCode
-from metagpt.schema import Message
-
-ErrorStr = """Tested passed:
-
-Tests failed:
-assert sort_array([1, 5, 2, 3, 4]) == [1, 2, 3, 4, 5] # output: [1, 2, 4, 3, 5]
-"""
-
-CODE = """
-def sort_array(arr):
-    # Helper function to count the number of ones in the binary representation
-    def count_ones(n):
-        return bin(n).count('1')
-    
-    # Sort the array using a custom key function
-    # The key function returns a tuple (number of ones, value) for each element
-    # This ensures that if two elements have the same number of ones, they are sorted by their value
-    sorted_arr = sorted(arr, key=lambda x: (count_ones(x), x))
-    
-    return sorted_arr
-```
-"""
-
-DebugContext = '''Solve the problem in Python:
-def sort_array(arr):
-    """
-    In this Kata, you have to sort an array of non-negative integers according to
-    number of ones in their binary representation in ascending order.
-    For similar number of ones, sort based on decimal value.
-
-    It must be implemented like this:
-    >>> sort_array([1, 5, 2, 3, 4]) == [1, 2, 3, 4, 5]
-    >>> sort_array([-2, -3, -4, -5, -6]) == [-6, -5, -4, -3, -2]
-    >>> sort_array([1, 0, 2, 3, 4]) [0, 1, 2, 3, 4]
-    """
-'''
-
-
-@pytest.mark.asyncio
-async def test_debug_code():
-    debug_context = Message(content=DebugContext)
-    new_code = await DebugCode().run(context=debug_context, code=CODE, runtime_result=ErrorStr)
-    assert "def sort_array(arr)" in new_code["code"]
--- a/tests/metagpt/actions/di/test_ml_action.py
+++ b/tests/metagpt/actions/di/test_ml_action.py
@@ -1,46 +0,0 @@
-import pytest
-
-from metagpt.actions.di.ml_action import WriteCodeWithToolsML
-from metagpt.schema import Plan, Task
-
-
-@pytest.mark.asyncio
-async def test_write_code_with_tools():
-    write_code_ml = WriteCodeWithToolsML()
-
-    task_map = {
-        "1": Task(
-            task_id="1",
-            instruction="随机生成一个pandas DataFrame数据集",
-            task_type="other",
-            dependent_task_ids=[],
-            code="""
-                import pandas as pd
-                df = pd.DataFrame({
-                    'a': [1, 2, 3, 4, 5],
-                    'b': [1.1, 2.2, 3.3, 4.4, np.nan],
-                    'c': ['aa', 'bb', 'cc', 'dd', 'ee'],
-                    'd': [1, 2, 3, 4, 5]
-                })
-                """,
-            is_finished=True,
-        ),
-        "2": Task(
-            task_id="2",
-            instruction="对数据集进行数据清洗",
-            task_type="data_preprocess",
-            dependent_task_ids=["1"],
-        ),
-    }
-    plan = Plan(
-        goal="构造数据集并进行数据清洗",
-        tasks=list(task_map.values()),
-        task_map=task_map,
-        current_task_id="2",
-    )
-    column_info = ""
-
-    _, code_with_ml = await write_code_ml.run([], plan, column_info)
-    code_with_ml = code_with_ml["code"]
-    assert len(code_with_ml) > 0
-    print(code_with_ml)
--- a/tests/metagpt/actions/di/test_write_analysis_code.py
+++ b/tests/metagpt/actions/di/test_write_analysis_code.py
@@ -1,134 +1,41 @@
-import asyncio
-
 import pytest

-from metagpt.actions.di.execute_nb_code import ExecuteNbCode
-from metagpt.actions.di.write_analysis_code import (
-    WriteCodeWithoutTools,
-    WriteCodeWithTools,
-)
-from metagpt.logs import logger
-from metagpt.schema import Message, Plan, Task
-from metagpt.strategy.planner import STRUCTURAL_CONTEXT
-
-
-@pytest.mark.skip
-@pytest.mark.asyncio
-async def test_write_code_by_list_plan():
-    write_code = WriteCodeWithoutTools()
-    execute_code = ExecuteNbCode()
-    messages = []
-    plan = ["随机生成一个pandas DataFrame时间序列", "绘制这个时间序列的直方图", "回顾已完成的任务", "求均值", "总结"]
-    for task in plan:
-        print(f"\n任务: {task}\n\n")
-        messages.append(Message(task, role="assistant"))
-        code = await write_code.run(messages)
-        if task.startswith(("回顾", "总结")):
-            assert code["language"] == "markdown"
-        else:
-            assert code["language"] == "python"
-        messages.append(Message(code["code"], role="assistant"))
-        assert len(code) > 0
-        output, _ = await execute_code.run(**code)
-        print(f"\n[Output]: 任务{task}的执行结果是: \n{output}\n")
-        messages.append(output)
+from metagpt.actions.di.write_analysis_code import WriteAnalysisCode
+from metagpt.schema import Message


@pytest.mark.asyncio
-async def test_tool_recommendation():
-    task = "clean and preprocess the data"
-    available_tools = {
-        "FillMissingValue": "Filling missing values",
-        "SplitBins": "Bin continuous data into intervals and return the bin identifier encoded as an integer value",
-    }
-    write_code = WriteCodeWithTools()
-    tools = await write_code._recommend_tool(task, available_tools)
+async def test_write_code():
+    write_code = WriteAnalysisCode()

-    assert len(tools) == 1
-    assert "FillMissingValue" in tools
+    user_requirement = "Run data analysis on sklearn Iris dataset, include a plot"
+    plan_status = "\n## Finished Tasks\n### code\n```python\n\n```\n\n### execution result\n\n\n## Current Task\nLoad the sklearn Iris dataset and perform exploratory data analysis\n\n## Task Guidance\nWrite complete code for 'Current Task'. And avoid duplicating code from 'Finished Tasks', such as repeated import of packages, reading data, etc.\nSpecifically, \nThe current task is about exploratory data analysis, please note the following:\n- Distinguish column types with `select_dtypes` for tailored analysis and visualization, such as correlation.\n- Remember to `import numpy as np` before using Numpy functions.\n\n"

-
-@pytest.mark.asyncio
-async def test_write_code_with_tools():
-    write_code = WriteCodeWithTools()
-
-    requirement = "构造数据集并进行数据清洗"
-    task_map = {
-        "1": Task(
-            task_id="1",
-            instruction="随机生成一个pandas DataFrame数据集",
-            task_type="other",
-            dependent_task_ids=[],
-            code="""
-                import pandas as pd
-                df = pd.DataFrame({
-                    'a': [1, 2, 3, 4, 5],
-                    'b': [1.1, 2.2, 3.3, 4.4, np.nan],
-                    'c': ['aa', 'bb', 'cc', 'dd', 'ee'],
-                    'd': [1, 2, 3, 4, 5]
-                })
-                """,
-            is_finished=True,
-        ),
-        "2": Task(
-            task_id="2",
-            instruction="对数据集进行数据清洗",
-            task_type="data_preprocess",
-            dependent_task_ids=["1"],
-        ),
-    }
-    plan = Plan(
-        goal="构造数据集并进行数据清洗",
-        tasks=list(task_map.values()),
-        task_map=task_map,
-        current_task_id="2",
-    )
-
-    context = STRUCTURAL_CONTEXT.format(
-        user_requirement=requirement,
-        context=plan.context,
-        tasks=list(task_map.values()),
-        current_task=plan.current_task.model_dump_json(),
-    )
-    context_msg = [Message(content=context, role="user")]
-
-    code = await write_code.run(context_msg, plan)
-    code = code["code"]
+    code = await write_code.run(user_requirement=user_requirement, plan_status=plan_status)
    assert len(code) > 0
-    print(code)
+    assert "sklearn" in code


@pytest.mark.asyncio
-async def test_write_code_to_correct_error():
-    structural_context = """
-    ## User Requirement
-    read a dataset test.csv and print its head
-    ## Current Plan
-    [
-        {
-            "task_id": "1",
-            "dependent_task_ids": [],
-            "instruction": "import pandas and load the dataset from 'test.csv'.",
-            "task_type": "",
-            "code": "",
-            "result": "",
-            "is_finished": false
-        },
-        {
-            "task_id": "2",
-            "dependent_task_ids": [
-                "1"
-            ],
-            "instruction": "Print the head of the dataset to display the first few rows.",
-            "task_type": "",
-            "code": "",
-            "result": "",
-            "is_finished": false
-        }
-    ]
+async def test_debug_with_reflection():
+    user_requirement = "Run data analysis on sklearn Iris dataset, include a plot"
+
+    plan_status = """
+    ## Finished Tasks
+    ### code
+    ```python
+    ```
+
+    ### execution result
+
    ## Current Task
-    {"task_id": "1", "dependent_task_ids": [], "instruction": "import pandas and load the dataset from 'test.csv'.", "task_type": "", "code": "", "result": "", "is_finished": false}
+    import pandas and load the dataset from 'test.csv'.
+
+    ## Task Guidance
+    Write complete code for 'Current Task'. And avoid duplicating code from 'Finished Tasks', such as repeated import of packages, reading data, etc.
+    Specifically, 
    """
+
    wrong_code = """import pandas as pd\ndata = pd.read_excel('test.csv')\ndata"""  # use read_excel to read a csv
    error = """
    Traceback (most recent call last):
@@ -139,186 +46,14 @@ async def test_write_code_to_correct_error():
            raise ValueError(
        ValueError: Excel file format cannot be determined, you must specify an engine manually.
    """
-    context = [
-        Message(content=structural_context, role="user"),
+    working_memory = [
        Message(content=wrong_code, role="assistant"),
        Message(content=error, role="user"),
    ]
-    new_code = await WriteCodeWithoutTools().run(context=context)
-    new_code = new_code["code"]
-    print(new_code)
+    new_code = await WriteAnalysisCode().run(
+        user_requirement=user_requirement,
+        plan_status=plan_status,
+        working_memory=working_memory,
+        use_reflection=True,
+    )
    assert "read_csv" in new_code  # should correct read_excel to read_csv
-
-
-@pytest.mark.asyncio
-async def test_write_code_reuse_code_simple():
-    structural_context = """
-    ## User Requirement
-    read a dataset test.csv and print its head
-    ## Current Plan
-    [
-        {
-            "task_id": "1",
-            "dependent_task_ids": [],
-            "instruction": "import pandas and load the dataset from 'test.csv'.",
-            "task_type": "",
-            "code": "import pandas as pd\ndata = pd.read_csv('test.csv')",
-            "result": "",
-            "is_finished": true
-        },
-        {
-            "task_id": "2",
-            "dependent_task_ids": [
-                "1"
-            ],
-            "instruction": "Print the head of the dataset to display the first few rows.",
-            "task_type": "",
-            "code": "",
-            "result": "",
-            "is_finished": false
-        }
-    ]
-    ## Current Task
-    {"task_id": "2", "dependent_task_ids": ["1"], "instruction": "Print the head of the dataset to display the first few rows.", "task_type": "", "code": "", "result": "", "is_finished": false}
-    """
-    context = [
-        Message(content=structural_context, role="user"),
-    ]
-    code = await WriteCodeWithoutTools().run(context=context)
-    code = code["code"]
-    print(code)
-    assert "pandas" not in code and "read_csv" not in code  # should reuse import and read statement from previous one
-
-
-@pytest.mark.skip
-@pytest.mark.asyncio
-async def test_write_code_reuse_code_long():
-    """test code reuse for long context"""
-
-    structural_context = """
-    ## User Requirement
-    Run data analysis on sklearn Iris dataset, include a plot
-    ## Current Plan
-    [
-        {
-            "task_id": "1",
-            "dependent_task_ids": [],
-            "instruction": "Load the Iris dataset from sklearn.",
-            "task_type": "",
-            "code": "from sklearn.datasets import load_iris\niris_data = load_iris()\niris_data['data'][0:5], iris_data['target'][0:5]",
-            "result": "(array([[5.1, 3.5, 1.4, 0.2],\n        [4.9, 3. , 1.4, 0.2],\n        [4.7, 3.2, 1.3, 0.2],\n        [4.6, 3.1, 1.5, 0.2],\n        [5. , 3.6, 1.4, 0.2]]),\n array([0, 0, 0, 0, 0]))",
-            "is_finished": true
-        },
-        {
-            "task_id": "2",
-            "dependent_task_ids": [
-                "1"
-            ],
-            "instruction": "Perform exploratory data analysis on the Iris dataset.",
-            "task_type": "",
-            "code": "",
-            "result": "",
-            "is_finished": false
-        },
-        {
-            "task_id": "3",
-            "dependent_task_ids": [
-                "2"
-            ],
-            "instruction": "Create a plot visualizing the Iris dataset features.",
-            "task_type": "",
-            "code": "",
-            "result": "",
-            "is_finished": false
-        }
-    ]
-    ## Current Task
-    {"task_id": "2", "dependent_task_ids": ["1"], "instruction": "Perform exploratory data analysis on the Iris dataset.", "task_type": "", "code": "", "result": "", "is_finished": false}
-    """
-    context = [
-        Message(content=structural_context, role="user"),
-    ]
-    trials_num = 5
-    trials = [WriteCodeWithoutTools().run(context=context, temperature=0.0) for _ in range(trials_num)]
-    trial_results = await asyncio.gather(*trials)
-    print(*trial_results, sep="\n\n***\n\n")
-    success = [
-        "load_iris" not in result["code"] and "iris_data" in result["code"] for result in trial_results
-    ]  # should reuse iris_data from previous tasks
-    success_rate = sum(success) / trials_num
-    logger.info(f"success rate: {success_rate :.2f}")
-    assert success_rate >= 0.8
-
-
-@pytest.mark.skip
-@pytest.mark.asyncio
-async def test_write_code_reuse_code_long_for_wine():
-    """test code reuse for long context"""
-
-    structural_context = """
-    ## User Requirement
-    Run data analysis on sklearn Wisconsin Breast Cancer dataset, include a plot, train a model to predict targets (20% as validation), and show validation accuracy
-    ## Current Plan
-    [
-        {
-            "task_id": "1",
-            "dependent_task_ids": [],
-            "instruction": "Load the sklearn Wine recognition dataset and perform exploratory data analysis."
-            "task_type": "",
-            "code": "from sklearn.datasets import load_wine\n# Load the Wine recognition dataset\nwine_data = load_wine()\n# Perform exploratory data analysis\nwine_data.keys()",
-            "result": "Truncated to show only the last 1000 characters\ndict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])",
-            "is_finished": true
-        },
-        {
-            "task_id": "2",
-            "dependent_task_ids": ["1"],
-            "instruction": "Create a plot to visualize some aspect of the wine dataset."
-            "task_type": "",
-            "code": "",
-            "result": "",
-            "is_finished": false
-        },
-        {
-            "task_id": "3",
-            "dependent_task_ids": ["1"],
-            "instruction": "Split the dataset into training and validation sets with a 20% validation size.",
-            "task_type": "",
-            "code": "",
-            "result": "",
-            "is_finished": false
-        },
-        {
-            "task_id": "4",
-            "dependent_task_ids": ["3"],
-            "instruction": "Train a model on the training set to predict wine class.",
-            "task_type": "",
-            "code": "",
-            "result": "",
-            "is_finished": false
-        },
-        {
-            "task_id": "5",
-            "dependent_task_ids": ["4"],
-            "instruction": "Evaluate the model on the validation set and report the accuracy.",
-            "task_type": "",
-            "code": "",
-            "result": "",
-            "is_finished": false
-        }
-    ]
-    ## Current Task
-    {"task_id": "2", "dependent_task_ids": ["1"], "instruction": "Create a plot to visualize some aspect of the Wine dataset.", "task_type": "", "code": "", "result": "", "is_finished": false}
-    """
-    context = [
-        Message(content=structural_context, role="user"),
-    ]
-    trials_num = 5
-    trials = [WriteCodeWithoutTools().run(context=context, temperature=0.0) for _ in range(trials_num)]
-    trial_results = await asyncio.gather(*trials)
-    print(*trial_results, sep="\n\n***\n\n")
-    success = [
-        "load_wine" not in result["code"] and "wine_data" in result["code"] for result in trial_results
-    ]  # should reuse iris_data from previous tasks
-    success_rate = sum(success) / trials_num
-    logger.info(f"success rate: {success_rate :.2f}")
-    assert success_rate >= 0.8
--- a/tests/metagpt/actions/di/test_write_plan.py
+++ b/tests/metagpt/actions/di/test_write_plan.py
@@ -23,12 +23,10 @@ def test_precheck_update_plan_from_rsp():


@pytest.mark.asyncio
-@pytest.mark.parametrize("use_tools", [(False), (True)])
-async def test_write_plan(use_tools):
+async def test_write_plan():
    rsp = await WritePlan().run(
-        context=[Message("run analysis on sklearn iris dataset", role="user")], use_tools=use_tools
+        context=[Message("Run data analysis on sklearn Iris dataset, include a plot", role="user")]
    )

    assert "task_id" in rsp
    assert "instruction" in rsp
-    assert "json" not in rsp  # the output should be the content inside ```json ```
--- a/tests/metagpt/roles/di/test_data_interpreter.py
+++ b/tests/metagpt/roles/di/test_data_interpreter.py
@@ -10,10 +10,9 @@ async def test_interpreter(mocker, auto_run):
    mocker.patch("metagpt.actions.di.execute_nb_code.ExecuteNbCode.run", return_value=("a successful run", True))
    mocker.patch("builtins.input", return_value="confirm")

-    requirement = "Run data analysis on sklearn Iris dataset, include a plot"
-    tools = []
+    requirement = "Run data analysis on sklearn Wine recognition dataset, include a plot, and train a model to predict wine class (20% as validation), and show validation accuracy."

-    di = DataInterpreter(auto_run=auto_run, use_tools=True, tools=tools)
+    di = DataInterpreter(auto_run=auto_run)
    rsp = await di.run(requirement)
    logger.info(rsp)
    assert len(rsp.content) > 0
@@ -21,3 +20,15 @@ async def test_interpreter(mocker, auto_run):
    finished_tasks = di.planner.plan.get_finished_tasks()
    assert len(finished_tasks) > 0
    assert len(finished_tasks[0].code) > 0  # check one task to see if code is recorded
+
+
+async def test_interpreter_react_mode(mocker):
+    mocker.patch("metagpt.actions.di.execute_nb_code.ExecuteNbCode.run", return_value=("a successful run", True))
+    mocker.patch("builtins.input", return_value="confirm")
+
+    requirement = "Run data analysis on sklearn Wine recognition dataset, include a plot, and train a model to predict wine class (20% as validation), and show validation accuracy."
+
+    di = DataInterpreter(react_mode="react")
+    rsp = await di.run(requirement)
+    logger.info(rsp)
+    assert len(rsp.content) > 0
--- a/tests/metagpt/roles/di/test_ml_engineer.py
+++ b/tests/metagpt/roles/di/test_ml_engineer.py
@@ -1,90 +0,0 @@
-import pytest
-
-from metagpt.actions.di.execute_nb_code import ExecuteNbCode
-from metagpt.logs import logger
-from metagpt.roles.di.ml_engineer import MLEngineer
-from metagpt.schema import Message, Plan, Task
-from metagpt.tools.tool_type import ToolType
-from tests.metagpt.actions.di.test_debug_code import CODE, DebugContext, ErrorStr
-
-
-def test_mle_init():
-    mle = MLEngineer(goal="test", auto_run=True, use_tools=True, tools=["tool1", "tool2"])
-    assert mle.tools == []
-
-
-MockPlan = Plan(
-    goal="This is a titanic passenger survival dataset, your goal is to predict passenger survival outcome. The target column is Survived. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report accuracy on the eval data. Train data path: 'tests/data/ml_datasets/titanic/split_train.csv', eval data path: 'tests/data/ml_datasets/titanic/split_eval.csv'.",
-    context="",
-    tasks=[
-        Task(
-            task_id="1",
-            dependent_task_ids=[],
-            instruction="Perform exploratory data analysis on the train dataset to understand the features and target variable.",
-            task_type="eda",
-            code="",
-            result="",
-            is_success=False,
-            is_finished=False,
-        )
-    ],
-    task_map={
-        "1": Task(
-            task_id="1",
-            dependent_task_ids=[],
-            instruction="Perform exploratory data analysis on the train dataset to understand the features and target variable.",
-            task_type="eda",
-            code="",
-            result="",
-            is_success=False,
-            is_finished=False,
-        )
-    },
-    current_task_id="1",
-)
-
-
-@pytest.mark.asyncio
-async def test_mle_write_code(mocker):
-    data_path = "tests/data/ml_datasets/titanic"
-
-    mle = MLEngineer(auto_run=True, use_tools=True)
-    mle.planner.plan = MockPlan
-
-    code, _ = await mle._write_code()
-    assert data_path in code["code"]
-
-
-@pytest.mark.asyncio
-async def test_mle_update_data_columns(mocker):
-    mle = MLEngineer(auto_run=True, use_tools=True)
-    mle.planner.plan = MockPlan
-
-    # manually update task type to test update
-    mle.planner.plan.current_task.task_type = ToolType.DATA_PREPROCESS.value
-
-    result = await mle._update_data_columns()
-    assert result is not None
-
-
-@pytest.mark.asyncio
-async def test_mle_debug_code(mocker):
-    mle = MLEngineer(auto_run=True, use_tools=True)
-    mle.working_memory.add(Message(content=ErrorStr, cause_by=ExecuteNbCode))
-    mle.latest_code = CODE
-    mle.debug_context = DebugContext
-    code, _ = await mle._write_code()
-    assert len(code) > 0
-
-
-@pytest.mark.skip
-@pytest.mark.asyncio
-async def test_ml_engineer():
-    data_path = "tests/data/ml_datasets/titanic"
-    requirement = f"This is a titanic passenger survival dataset, your goal is to predict passenger survival outcome. The target column is Survived. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report accuracy on the eval data. Train data path: '{data_path}/split_train.csv', eval data path: '{data_path}/split_eval.csv'."
-    tools = ["FillMissingValue", "CatCross", "dummy_tool"]
-
-    mle = MLEngineer(auto_run=True, use_tools=True, tools=tools)
-    rsp = await mle.run(requirement)
-    logger.info(rsp)
-    assert len(rsp.content) > 0
--- a/tests/mock/mock_llm.py
+++ b/tests/mock/mock_llm.py
@@ -32,14 +32,13 @@ class MockLLM(OriginalLLM):

    async def original_aask(
        self,
-        msg: str,
+        msg: Union[str, list[dict[str, str]]],
        system_msgs: Optional[list[str]] = None,
        format_msgs: Optional[list[dict[str, str]]] = None,
        images: Optional[Union[str, list[str]]] = None,
        timeout=3,
        stream=True,
-    ):
-        """A copy of metagpt.provider.base_llm.BaseLLM.aask, we can't use super().aask because it will be mocked"""
+    ) -> str:
        if system_msgs:
            message = self._system_msgs(system_msgs)
        else:
@@ -48,7 +47,11 @@ class MockLLM(OriginalLLM):
            message = []
        if format_msgs:
            message.extend(format_msgs)
-        message.append(self._user_msg(msg, images=images))
+        if isinstance(msg, str):
+            message.append(self._user_msg(msg, images=images))
+        else:
+            message.extend(msg)
+        logger.debug(message)
        rsp = await self.acompletion_text(message, stream=stream, timeout=timeout)
        return rsp

@@ -72,14 +75,19 @@ class MockLLM(OriginalLLM):

    async def aask(
        self,
-        msg: str,
+        msg: Union[str, list[dict[str, str]]],
        system_msgs: Optional[list[str]] = None,
        format_msgs: Optional[list[dict[str, str]]] = None,
        images: Optional[Union[str, list[str]]] = None,
        timeout=3,
        stream=True,
    ) -> str:
-        msg_key = msg  # used to identify it a message has been called before
+        # used to identify it a message has been called before
+        if isinstance(msg, list):
+            msg_key = "#MSG_SEP#".join([m["content"] for m in msg])
+        else:
+            msg_key = msg
+
        if system_msgs:
            joined_system_msg = "#MSG_SEP#".join(system_msgs) + "#SYSTEM_MSG_END#"
            msg_key = joined_system_msg + msg_key