feat: merge geekan:main

2026-06-23 15:48:11 +02:00 · 2024-03-05 10:59:34 +08:00 · 2024-03-05 10:59:34 +08:00 · e22a28215d
commit e22a28215d
parent 3b1644b7ff 0e63b92883
102 changed files with 1766 additions and 756 deletions
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -14,6 +14,7 @@ import re
 import uuid
 from typing import Callable

+import aiohttp.web
 import pytest

 from metagpt.const import DEFAULT_WORKSPACE_ROOT, TEST_DATA_PATH
@ -171,9 +172,8 @@ def new_filename(mocker):
    yield mocker


-@pytest.fixture(scope="session")
-def search_rsp_cache():
-    rsp_cache_file_path = TEST_DATA_PATH / "search_rsp_cache.json"  # read repo-provided
+def _rsp_cache(name):
+    rsp_cache_file_path = TEST_DATA_PATH / f"{name}.json"  # read repo-provided
    if os.path.exists(rsp_cache_file_path):
        with open(rsp_cache_file_path, "r") as f1:
            rsp_cache_json = json.load(f1)
@ -184,6 +184,16 @@ def search_rsp_cache():
        json.dump(rsp_cache_json, f2, indent=4, ensure_ascii=False)


+@pytest.fixture(scope="session")
+def search_rsp_cache():
+    """Session-scoped fixture that delegates to ``_rsp_cache("search_rsp_cache")``.
+
+    ``_rsp_cache`` builds its file path as ``TEST_DATA_PATH / f"{name}.json"``,
+    so this fixture is backed by ``search_rsp_cache.json`` in the test data
+    directory. Whatever ``_rsp_cache`` yields is passed through unchanged.
+    """
+    yield from _rsp_cache("search_rsp_cache")
+
+
+@pytest.fixture(scope="session")
+def mermaid_rsp_cache():
+    """Session-scoped fixture that delegates to ``_rsp_cache("mermaid_rsp_cache")``.
+
+    ``_rsp_cache`` builds its file path as ``TEST_DATA_PATH / f"{name}.json"``,
+    so this fixture is backed by ``mermaid_rsp_cache.json`` in the test data
+    directory. Whatever ``_rsp_cache`` yields is passed through unchanged.
+    """
+    yield from _rsp_cache("mermaid_rsp_cache")
+
+
@pytest.fixture
 def aiohttp_mocker(mocker):
    MockResponse = type("MockResponse", (MockAioResponse,), {})
@ -231,3 +241,32 @@ def search_engine_mocker(aiohttp_mocker, curl_cffi_mocker, httplib2_mocker, sear
    aiohttp_mocker.rsp_cache = httplib2_mocker.rsp_cache = curl_cffi_mocker.rsp_cache = search_rsp_cache
    aiohttp_mocker.check_funcs = httplib2_mocker.check_funcs = curl_cffi_mocker.check_funcs = check_funcs
    yield check_funcs
+
+
+@pytest.fixture
+def http_server():
+    """Provide a coroutine factory that starts a throwaway local HTTP server.
+
+    The fixture returns the ``start`` coroutine function without awaiting it.
+    Awaiting ``start()`` inside a test binds a server to an OS-assigned port on
+    "localhost" and returns ``(site, url)``, where ``url`` is the base URL of
+    the server. Every request is answered with the same static HTML page.
+
+    Nothing in this fixture stops the server, so the test that calls
+    ``start()`` is responsible for shutting the returned ``site`` down
+    (presumably via ``await site.stop()`` -- confirm against the callers).
+    """
+
+    async def handler(request):
+        return aiohttp.web.Response(
+            text="""<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8">
+            <title>MetaGPT</title></head><body><h1>MetaGPT</h1></body></html>""",
+            content_type="text/html",
+        )
+
+    async def start():
+        server = aiohttp.web.Server(handler)
+        runner = aiohttp.web.ServerRunner(server)
+        await runner.setup()
+        # Port 0 asks the OS for any free port, so parallel tests do not collide.
+        site = aiohttp.web.TCPSite(runner, "localhost", 0)
+        await site.start()
+        # "localhost" may bind an IPv6 socket, whose address is a 4-tuple
+        # (host, port, flowinfo, scope_id); keep only host and port so the
+        # unpacking works for both address families. ``runner.addresses`` is
+        # the public accessor for the bound socket names.
+        host, port = runner.addresses[0][:2]
+        if ":" in host:
+            # IPv6 literals must be bracketed inside a URL.
+            host = f"[{host}]"
+        return site, f"http://{host}:{port}"
+
+    return start
+
+
+@pytest.fixture
+def mermaid_mocker(aiohttp_mocker, mermaid_rsp_cache):
+    """Point the aiohttp mocker at the mermaid response cache.
+
+    Installs ``mermaid_rsp_cache`` as the mocker's ``rsp_cache`` and a fresh,
+    empty mapping as its ``check_funcs``, then yields that mapping so a test
+    can register entries keyed by ``(str, str)`` tuples.
+    """
+    registered_checks: dict[tuple[str, str], Callable[[dict], str]] = {}
+    aiohttp_mocker.rsp_cache = mermaid_rsp_cache
+    aiohttp_mocker.check_funcs = registered_checks
+    yield registered_checks
--- a/tests/data/mermaid_rsp_cache.json
+++ b/tests/data/mermaid_rsp_cache.json
--- a/tests/data/rsp_cache.json
+++ b/tests/data/rsp_cache.json
@ -355,7 +355,7 @@
        "code": "print(data.head())"
    },
    "\n    # Context:\n    user: run analysis on sklearn iris dataset\n    # Task:\n    Based on the context, write a plan or modify an existing plan of what you should do to achieve the goal. A plan consists of one to 5 tasks.\n    If you are modifying an existing plan, carefully follow the instruction, don't make unnecessary changes. Give the whole plan unless instructed to modify only one task of the plan.\n    If you encounter errors on the current task, revise and output the current single task only.\n    Output a list of jsons following the format:\n    ```json\n    [\n        {\n            \"task_id\": str = \"unique identifier for a task in plan, can be an ordinal\",\n            \"dependent_task_ids\": list[str] = \"ids of tasks prerequisite to this task\",\n            \"instruction\": \"what you should do in this task, one short phrase or sentence\",\n        },\n        ...\n    ]\n    ```\n    ": "```json\n[\n    {\n        \"task_id\": \"1\",\n        \"dependent_task_ids\": [],\n        \"instruction\": \"Import the Iris dataset from sklearn.datasets\"\n    },\n    {\n        \"task_id\": \"2\",\n        \"dependent_task_ids\": [\"1\"],\n        \"instruction\": \"Perform exploratory data analysis to understand the dataset\"\n    },\n    {\n        \"task_id\": \"3\",\n        \"dependent_task_ids\": [\"2\"],\n        \"instruction\": \"Preprocess the data to prepare it for modeling\"\n    },\n    {\n        \"task_id\": \"4\",\n        \"dependent_task_ids\": [\"3\"],\n        \"instruction\": \"Split the dataset into training and testing sets\"\n    },\n    {\n        \"task_id\": \"5\",\n        \"dependent_task_ids\": [\"4\"],\n        \"instruction\": \"Train a classifier using the training set and evaluate it using the test set\"\n    }\n]\n```",
-    "[{\"role\": \"user\", \"content\": \"\\nPlease assign a task type to each task in the list below from the given categories:\\nTask 1: Import the Iris dataset from sklearn.datasets\\nTask 2: Perform exploratory data analysis to understand the dataset\\nTask 3: Preprocess the data to prepare it for modeling\\nTask 4: Split the dataset into training and testing sets\\nTask 5: Train a classifier using the training set and evaluate it using the test set\\n\\n## All Task Type:\\n- **eda**: For performing exploratory data analysis\\n- **data_preprocess**: Only for changing value inplace.\\n- **feature_engineering**: Only for creating new columns for input data.\\n- **model_train**: Only for training model.\\n- **model_evaluate**: Only for evaluating model.\\n- **stable_diffusion**: Related to text2image, image2image using stable diffusion model.\\n- **image2webpage**: For converting image into webpage code.\\n- **web_scraping**: For scraping data from web pages.\\n- **other**: Any tools not in the defined categories\\n\"}]": {
+    "[{\"role\": \"user\", \"content\": \"\\nPlease assign a task type to each task in the list below from the given categories:\\nTask 1: Import the Iris dataset from sklearn.datasets\\nTask 2: Perform exploratory data analysis to understand the dataset\\nTask 3: Preprocess the data to prepare it for modeling\\nTask 4: Split the dataset into training and testing sets\\nTask 5: Train a classifier using the training set and evaluate it using the test set\\n\\n## All Task Type:\\n- **eda**: For performing exploratory data analysis\\n- **data_preprocess**: Only for changing value inplace.\\n- **email_login**: For logging to an email.\\n- **feature_engineering**: Only for creating new columns for input data.\\n- **model_train**: Only for training model.\\n- **model_evaluate**: Only for evaluating model.\\n- **stable_diffusion**: Related to text2image, image2image using stable diffusion model.\\n- **image2webpage**: For converting image into webpage code.\\n- **web_scraping**: For scraping data from web pages.\\n- **other**: Any tools not in the defined categories\\n\"}]": {
        "task_type": [
            "other",
            "eda",
@ -366,24 +366,27 @@
        ]
    },
    "\n    # Context:\n    user: \n## User Requirement\nRun data analysis on sklearn Iris dataset, include a plot\n## Context\n\n## Current Plan\n[]\n## Current Task\n{}\n\n    # Task:\n    Based on the context, write a plan or modify an existing plan of what you should do to achieve the goal. A plan consists of one to 3 tasks.\n    If you are modifying an existing plan, carefully follow the instruction, don't make unnecessary changes. Give the whole plan unless instructed to modify only one task of the plan.\n    If you encounter errors on the current task, revise and output the current single task only.\n    Output a list of jsons following the format:\n    ```json\n    [\n        {\n            \"task_id\": str = \"unique identifier for a task in plan, can be an ordinal\",\n            \"dependent_task_ids\": list[str] = \"ids of tasks prerequisite to this task\",\n            \"instruction\": \"what you should do in this task, one short phrase or sentence\",\n        },\n        ...\n    ]\n    ```\n    ": "```json\n[\n    {\n        \"task_id\": \"1\",\n        \"dependent_task_ids\": [],\n        \"instruction\": \"Load the sklearn Iris dataset.\"\n    },\n    {\n        \"task_id\": \"2\",\n        \"dependent_task_ids\": [\"1\"],\n        \"instruction\": \"Perform exploratory data analysis on the Iris dataset.\"\n    },\n    {\n        \"task_id\": \"3\",\n        \"dependent_task_ids\": [\"2\"],\n        \"instruction\": \"Create a plot visualizing the Iris dataset.\"\n    }\n]\n```",
-    "[{\"role\": \"user\", \"content\": \"\\nPlease assign a task type to each task in the list below from the given categories:\\nTask 1: Load the sklearn Iris dataset.\\nTask 2: Perform exploratory data analysis on the Iris dataset.\\nTask 3: Create a plot visualizing the Iris dataset.\\n\\n## All Task Type:\\n- **eda**: For performing exploratory data analysis\\n- **data_preprocess**: Only for changing value inplace.\\n- **feature_engineering**: Only for creating new columns for input data.\\n- **model_train**: Only for training model.\\n- **model_evaluate**: Only for evaluating model.\\n- **stable_diffusion**: Related to text2image, image2image using stable diffusion model.\\n- **image2webpage**: For converting image into webpage code.\\n- **web_scraping**: For scraping data from web pages.\\n- **other**: Any tools not in the defined categories\\n\"}]": {
+    "[{\"role\": \"user\", \"content\": \"\\nPlease assign a task type to each task in the list below from the given categories:\\nTask 1: Load the sklearn Iris dataset.\\nTask 2: Perform exploratory data analysis on the Iris dataset.\\nTask 3: Create a plot visualizing the Iris dataset.\\n\\n## All Task Type:\\n- **eda**: For performing exploratory data analysis\\n- **data_preprocess**: Only for changing value inplace.\\n- **email_login**: For logging to an email.\\n- **feature_engineering**: Only for creating new columns for input data.\\n- **model_train**: Only for training model.\\n- **model_evaluate**: Only for evaluating model.\\n- **stable_diffusion**: Related to text2image, image2image using stable diffusion model.\\n- **image2webpage**: For converting image into webpage code.\\n- **web_scraping**: For scraping data from web pages.\\n- **other**: Any tools not in the defined categories\\n\"}]": {
        "task_type": [
-            "other",
+            "data_preprocess",
            "eda",
            "other"
        ]
    },
-    "[{\"role\": \"system\", \"content\": \"You are Code Interpreter, a world-class programmer that can complete any goal by executing code. Strictly follow the plan and generate code step by step. Each step of the code will be executed on the user's machine, and the user will provide the code execution results to you.**Notice: The code for the next step depends on the code for the previous step. Must reuse variables in the lastest other code directly, dont creat it again, it is very import for you. Use !pip install in a standalone block to install missing packages.Usually the libraries you need are already installed.Dont check if packages already imported.**\"}, {\"role\": \"user\", \"content\": \"\\n## User Requirement\\nRun data analysis on sklearn Iris dataset, include a plot\\n## Context\\n\\n## Current Plan\\n[\\n    {\\n        \\\"task_id\\\": \\\"1\\\",\\n        \\\"dependent_task_ids\\\": [],\\n        \\\"instruction\\\": \\\"Load the sklearn Iris dataset.\\\",\\n        \\\"task_type\\\": \\\"other\\\",\\n        \\\"code\\\": \\\"\\\",\\n        \\\"result\\\": \\\"\\\",\\n        \\\"is_success\\\": false,\\n        \\\"is_finished\\\": false\\n    },\\n    {\\n        \\\"task_id\\\": \\\"2\\\",\\n        \\\"dependent_task_ids\\\": [\\n            \\\"1\\\"\\n        ],\\n        \\\"instruction\\\": \\\"Perform exploratory data analysis on the Iris dataset.\\\",\\n        \\\"task_type\\\": \\\"eda\\\",\\n        \\\"code\\\": \\\"\\\",\\n        \\\"result\\\": \\\"\\\",\\n        \\\"is_success\\\": false,\\n        \\\"is_finished\\\": false\\n    },\\n    {\\n        \\\"task_id\\\": \\\"3\\\",\\n        \\\"dependent_task_ids\\\": [\\n            \\\"2\\\"\\n        ],\\n        \\\"instruction\\\": \\\"Create a plot visualizing the Iris dataset.\\\",\\n        \\\"task_type\\\": \\\"other\\\",\\n        \\\"code\\\": \\\"\\\",\\n        \\\"result\\\": \\\"\\\",\\n        \\\"is_success\\\": false,\\n        \\\"is_finished\\\": false\\n    
}\\n]\\n## Current Task\\n{\\\"task_id\\\":\\\"1\\\",\\\"dependent_task_ids\\\":[],\\\"instruction\\\":\\\"Load the sklearn Iris dataset.\\\",\\\"task_type\\\":\\\"other\\\",\\\"code\\\":\\\"\\\",\\\"result\\\":\\\"\\\",\\\"is_success\\\":false,\\\"is_finished\\\":false}\\n\"}, {\"role\": \"user\", \"content\": \"\\n# Instruction\\nWrite complete code for 'Current Task'. And avoid duplicating code from finished tasks, such as repeated import of packages, reading data, etc.\\nSpecifically, \\n\\n# Capabilities\\n- You can utilize pre-defined tools in any code lines from 'Available Tools' in the form of Python Class.\\n- You can freely combine the use of any other public packages, like sklearn, numpy, pandas, etc..\\n\\n# Available Tools (can be empty):\\nEach Class tool is described in JSON format. When you call a tool, import the tool first.\\n{}\\n\\n# Constraints:\\n- Ensure the output new code is executable in the same Jupyter notebook with previous tasks code have been executed.\\n- Always prioritize using pre-defined tools for the same functionality.\\n\"}]": {
-        "code": "from sklearn.datasets import load_iris\niris_data = load_iris()"
+    "[{\"role\": \"user\", \"content\": \"\\n## User Requirement:\\nLoad the sklearn Iris dataset.\\n\\n## Task\\nRecommend up to five tools from 'Available Tools' that can help solve the 'User Requirement'. \\n\\n## Available Tools:\\n{'FillMissingValue': 'Completing missing values with simple strategies.', 'MinMaxScale': 'Transform features by scaling each feature to a range, which is (0, 1).', 'StandardScale': 'Standardize features by removing the mean and scaling to unit variance.', 'MaxAbsScale': 'Scale each feature by its maximum absolute value.', 'RobustScale': 'Apply the RobustScaler to scale features using statistics that are robust to outliers.', 'OrdinalEncode': 'Encode categorical features as ordinal integers.', 'OneHotEncode': 'Apply one-hot encoding to specified categorical columns, the original columns will be dropped.', 'LabelEncode': 'Apply label encoding to specified categorical columns in-place.'}\\n\\n## Tool Selection and Instructions:\\n- Select tools most relevant to completing the 'User Requirement'.\\n- If you believe that no tools are suitable, indicate with an empty list.\\n- Only list the names of the tools, not the full schema of each tool.\\n- Ensure selected tools are listed in 'Available Tools'.\\n\"}]": {
+        "recommend_tools": []
    },
-    "[{\"role\": \"system\", \"content\": \"You are Code Interpreter, a world-class programmer that can complete any goal by executing code. Strictly follow the plan and generate code step by step. Each step of the code will be executed on the user's machine, and the user will provide the code execution results to you.**Notice: The code for the next step depends on the code for the previous step. Must reuse variables in the lastest other code directly, dont creat it again, it is very import for you. Use !pip install in a standalone block to install missing packages.Usually the libraries you need are already installed.Dont check if packages already imported.**\"}, {\"role\": \"user\", \"content\": \"\\n## User Requirement\\nRun data analysis on sklearn Iris dataset, include a plot\\n## Context\\n\\n## Current Plan\\n[\\n    {\\n        \\\"task_id\\\": \\\"1\\\",\\n        \\\"dependent_task_ids\\\": [],\\n        \\\"instruction\\\": \\\"Load the sklearn Iris dataset.\\\",\\n        \\\"task_type\\\": \\\"other\\\",\\n        \\\"code\\\": \\\"\\\",\\n        \\\"result\\\": \\\"a successful run\\\",\\n        \\\"is_success\\\": true,\\n        \\\"is_finished\\\": true\\n    },\\n    {\\n        \\\"task_id\\\": \\\"2\\\",\\n        \\\"dependent_task_ids\\\": [\\n            \\\"1\\\"\\n        ],\\n        \\\"instruction\\\": \\\"Perform exploratory data analysis on the Iris dataset.\\\",\\n        \\\"task_type\\\": \\\"eda\\\",\\n        \\\"code\\\": \\\"\\\",\\n        \\\"result\\\": \\\"\\\",\\n        \\\"is_success\\\": false,\\n        \\\"is_finished\\\": false\\n    },\\n    {\\n        \\\"task_id\\\": \\\"3\\\",\\n        \\\"dependent_task_ids\\\": [\\n            \\\"2\\\"\\n        ],\\n        \\\"instruction\\\": \\\"Create a plot visualizing the Iris dataset.\\\",\\n        \\\"task_type\\\": \\\"other\\\",\\n        \\\"code\\\": \\\"\\\",\\n        \\\"result\\\": \\\"\\\",\\n        \\\"is_success\\\": false,\\n        
\\\"is_finished\\\": false\\n    }\\n]\\n## Current Task\\n{\\\"task_id\\\":\\\"2\\\",\\\"dependent_task_ids\\\":[\\\"1\\\"],\\\"instruction\\\":\\\"Perform exploratory data analysis on the Iris dataset.\\\",\\\"task_type\\\":\\\"eda\\\",\\\"code\\\":\\\"\\\",\\\"result\\\":\\\"\\\",\\\"is_success\\\":false,\\\"is_finished\\\":false}\\n\"}, {\"role\": \"user\", \"content\": \"\\n# Instruction\\nWrite complete code for 'Current Task'. And avoid duplicating code from finished tasks, such as repeated import of packages, reading data, etc.\\nSpecifically, \\n\\n# Capabilities\\n- You can utilize pre-defined tools in any code lines from 'Available Tools' in the form of Python Class.\\n- You can freely combine the use of any other public packages, like sklearn, numpy, pandas, etc..\\n\\n# Available Tools (can be empty):\\nEach Class tool is described in JSON format. When you call a tool, import the tool first.\\n{}\\n\\n# Constraints:\\n- Ensure the output new code is executable in the same Jupyter notebook with previous tasks code have been executed.\\n- Always prioritize using pre-defined tools for the same functionality.\\n\"}]": {
-        "code": "from sklearn import datasets\nimport pandas as pd\n\n# Load Iris dataset\niris = datasets.load_iris()\ndf_iris = pd.DataFrame(data=iris.data, columns=iris.feature_names)\ndf_iris['target'] = iris.target\n\n# Display basic information about the dataset\nprint(df_iris.info())\n\n# Display statistical summary of the dataset\nprint(df_iris.describe())\n\n# Display the first few rows of the dataset\nprint(df_iris.head())\n\n# Display the distribution of the target variable\ntarget_counts = df_iris['target'].value_counts()\nprint(target_counts)"
+    "[{\"role\": \"system\", \"content\": \"You are Code Interpreter, a world-class programmer that can complete any goal by executing code. Strictly follow the plan and generate code step by step. Each step of the code will be executed on the user's machine, and the user will provide the code execution results to you.**Notice: The code for the next step depends on the code for the previous step. Must reuse variables in the lastest other code directly, dont creat it again, it is very import for you. Use !pip install in a standalone block to install missing packages.Usually the libraries you need are already installed.Dont check if packages already imported.**\"}, {\"role\": \"user\", \"content\": \"\\n## User Requirement\\nRun data analysis on sklearn Iris dataset, include a plot\\n## Context\\n\\n## Current Plan\\n[\\n    {\\n        \\\"task_id\\\": \\\"1\\\",\\n        \\\"dependent_task_ids\\\": [],\\n        \\\"instruction\\\": \\\"Load the sklearn Iris dataset.\\\",\\n        \\\"task_type\\\": \\\"data_preprocess\\\",\\n        \\\"code\\\": \\\"\\\",\\n        \\\"result\\\": \\\"\\\",\\n        \\\"is_success\\\": false,\\n        \\\"is_finished\\\": false\\n    },\\n    {\\n        \\\"task_id\\\": \\\"2\\\",\\n        \\\"dependent_task_ids\\\": [\\n            \\\"1\\\"\\n        ],\\n        \\\"instruction\\\": \\\"Perform exploratory data analysis on the Iris dataset.\\\",\\n        \\\"task_type\\\": \\\"eda\\\",\\n        \\\"code\\\": \\\"\\\",\\n        \\\"result\\\": \\\"\\\",\\n        \\\"is_success\\\": false,\\n        \\\"is_finished\\\": false\\n    },\\n    {\\n        \\\"task_id\\\": \\\"3\\\",\\n        \\\"dependent_task_ids\\\": [\\n            \\\"2\\\"\\n        ],\\n        \\\"instruction\\\": \\\"Create a plot visualizing the Iris dataset.\\\",\\n        \\\"task_type\\\": \\\"other\\\",\\n        \\\"code\\\": \\\"\\\",\\n        \\\"result\\\": \\\"\\\",\\n        \\\"is_success\\\": false,\\n        \\\"is_finished\\\": 
false\\n    }\\n]\\n## Current Task\\n{\\\"task_id\\\":\\\"1\\\",\\\"dependent_task_ids\\\":[],\\\"instruction\\\":\\\"Load the sklearn Iris dataset.\\\",\\\"task_type\\\":\\\"data_preprocess\\\",\\\"code\\\":\\\"\\\",\\\"result\\\":\\\"\\\",\\\"is_success\\\":false,\\\"is_finished\\\":false}\\n\"}, {\"role\": \"user\", \"content\": \"\\n# Instruction\\nWrite complete code for 'Current Task'. And avoid duplicating code from finished tasks, such as repeated import of packages, reading data, etc.\\nSpecifically, \\nThe current task is about data preprocessing, please note the following:\\n- Monitor data types per column, applying appropriate methods.\\n- Ensure operations are on existing dataset columns.\\n- Avoid writing processed data to files.\\n- Avoid any change to label column, such as standardization, etc.\\n- Prefer alternatives to one-hot encoding for categorical data.\\n- Only encode or scale necessary columns to allow for potential feature-specific engineering tasks (like time_extract, binning, extraction, etc.) later.\\n- Each step do data preprocessing to train, must do same for test separately at the same time.\\n\\n\\n# Capabilities\\n- You can utilize pre-defined tools in any code lines from 'Available Tools' in the form of Python Class.\\n- You can freely combine the use of any other public packages, like sklearn, numpy, pandas, etc..\\n\\n# Available Tools (can be empty):\\nEach Class tool is described in JSON format. When you call a tool, import the tool first.\\n{}\\n\\n# Constraints:\\n- Ensure the output new code is executable in the same Jupyter notebook with previous tasks code have been executed.\\n- Always prioritize using pre-defined tools for the same functionality.\\n\"}]": {
+        "code": "from sklearn.datasets import load_iris\niris_data = load_iris()\nX, y = iris_data.data, iris_data.target"
    },
-    "[{\"role\": \"system\", \"content\": \"You are Code Interpreter, a world-class programmer that can complete any goal by executing code. Strictly follow the plan and generate code step by step. Each step of the code will be executed on the user's machine, and the user will provide the code execution results to you.**Notice: The code for the next step depends on the code for the previous step. Must reuse variables in the lastest other code directly, dont creat it again, it is very import for you. Use !pip install in a standalone block to install missing packages.Usually the libraries you need are already installed.Dont check if packages already imported.**\"}, {\"role\": \"user\", \"content\": \"\\n## User Requirement\\nRun data analysis on sklearn Iris dataset, include a plot\\n## Context\\n\\n## Current Plan\\n[\\n    {\\n        \\\"task_id\\\": \\\"1\\\",\\n        \\\"dependent_task_ids\\\": [],\\n        \\\"instruction\\\": \\\"Load the sklearn Iris dataset.\\\",\\n        \\\"task_type\\\": \\\"other\\\",\\n        \\\"code\\\": \\\"\\\",\\n        \\\"result\\\": \\\"a successful run\\\",\\n        \\\"is_success\\\": true,\\n        \\\"is_finished\\\": true\\n    },\\n    {\\n        \\\"task_id\\\": \\\"2\\\",\\n        \\\"dependent_task_ids\\\": [\\n            \\\"1\\\"\\n        ],\\n        \\\"instruction\\\": \\\"Perform exploratory data analysis on the Iris dataset.\\\",\\n        \\\"task_type\\\": \\\"eda\\\",\\n        \\\"code\\\": \\\"\\\",\\n        \\\"result\\\": \\\"a successful run\\\",\\n        \\\"is_success\\\": true,\\n        \\\"is_finished\\\": true\\n    },\\n    {\\n        \\\"task_id\\\": \\\"3\\\",\\n        \\\"dependent_task_ids\\\": [\\n            \\\"2\\\"\\n        ],\\n        \\\"instruction\\\": \\\"Create a plot visualizing the Iris dataset.\\\",\\n        \\\"task_type\\\": \\\"other\\\",\\n        \\\"code\\\": \\\"\\\",\\n        \\\"result\\\": \\\"\\\",\\n        \\\"is_success\\\": false,\\n        
\\\"is_finished\\\": false\\n    }\\n]\\n## Current Task\\n{\\\"task_id\\\":\\\"3\\\",\\\"dependent_task_ids\\\":[\\\"2\\\"],\\\"instruction\\\":\\\"Create a plot visualizing the Iris dataset.\\\",\\\"task_type\\\":\\\"other\\\",\\\"code\\\":\\\"\\\",\\\"result\\\":\\\"\\\",\\\"is_success\\\":false,\\\"is_finished\\\":false}\\n\"}, {\"role\": \"user\", \"content\": \"\\n# Instruction\\nWrite complete code for 'Current Task'. And avoid duplicating code from finished tasks, such as repeated import of packages, reading data, etc.\\nSpecifically, \\n\\n# Capabilities\\n- You can utilize pre-defined tools in any code lines from 'Available Tools' in the form of Python Class.\\n- You can freely combine the use of any other public packages, like sklearn, numpy, pandas, etc..\\n\\n# Available Tools (can be empty):\\nEach Class tool is described in JSON format. When you call a tool, import the tool first.\\n{}\\n\\n# Constraints:\\n- Ensure the output new code is executable in the same Jupyter notebook with previous tasks code have been executed.\\n- Always prioritize using pre-defined tools for the same functionality.\\n\"}]": {
-        "code": "from sklearn import datasets\nimport matplotlib.pyplot as plt\nimport pandas as pd\n\n# Load Iris dataset\niris = datasets.load_iris()\niris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)\niris_df['target'] = iris.target\niris_df['target_name'] = iris_df['target'].apply(lambda x: iris.target_names[x])\n\n# Plotting\nfig, ax = plt.subplots(figsize=(12, 8))\nfor target, target_name in zip(iris.target_names, iris.target_names):\n    subset = iris_df[iris_df['target_name'] == target_name]\n    ax.scatter(subset[iris.feature_names[0]], subset[iris.feature_names[1]], label=target_name)\n\nax.set_xlabel(iris.feature_names[0])\nax.set_ylabel(iris.feature_names[1])\nax.legend()\nplt.show()"
+    "[{\"role\": \"system\", \"content\": \"You are Code Interpreter, a world-class programmer that can complete any goal by executing code. Strictly follow the plan and generate code step by step. Each step of the code will be executed on the user's machine, and the user will provide the code execution results to you.**Notice: The code for the next step depends on the code for the previous step. Must reuse variables in the lastest other code directly, dont creat it again, it is very import for you. Use !pip install in a standalone block to install missing packages.Usually the libraries you need are already installed.Dont check if packages already imported.**\"}, {\"role\": \"user\", \"content\": \"\\n## User Requirement\\nRun data analysis on sklearn Iris dataset, include a plot\\n## Context\\n\\n## Current Plan\\n[\\n    {\\n        \\\"task_id\\\": \\\"1\\\",\\n        \\\"dependent_task_ids\\\": [],\\n        \\\"instruction\\\": \\\"Load the sklearn Iris dataset.\\\",\\n        \\\"task_type\\\": \\\"data_preprocess\\\",\\n        \\\"code\\\": \\\"from sklearn.datasets import load_iris\\\\niris_data = load_iris()\\\\nX, y = iris_data.data, iris_data.target\\\",\\n        \\\"result\\\": \\\"a successful run\\\",\\n        \\\"is_success\\\": true,\\n        \\\"is_finished\\\": true\\n    },\\n    {\\n        \\\"task_id\\\": \\\"2\\\",\\n        \\\"dependent_task_ids\\\": [\\n            \\\"1\\\"\\n        ],\\n        \\\"instruction\\\": \\\"Perform exploratory data analysis on the Iris dataset.\\\",\\n        \\\"task_type\\\": \\\"eda\\\",\\n        \\\"code\\\": \\\"\\\",\\n        \\\"result\\\": \\\"\\\",\\n        \\\"is_success\\\": false,\\n        \\\"is_finished\\\": false\\n    },\\n    {\\n        \\\"task_id\\\": \\\"3\\\",\\n        \\\"dependent_task_ids\\\": [\\n            \\\"2\\\"\\n        ],\\n        \\\"instruction\\\": \\\"Create a plot visualizing the Iris dataset.\\\",\\n        \\\"task_type\\\": \\\"other\\\",\\n        
\\\"code\\\": \\\"\\\",\\n        \\\"result\\\": \\\"\\\",\\n        \\\"is_success\\\": false,\\n        \\\"is_finished\\\": false\\n    }\\n]\\n## Current Task\\n{\\\"task_id\\\":\\\"2\\\",\\\"dependent_task_ids\\\":[\\\"1\\\"],\\\"instruction\\\":\\\"Perform exploratory data analysis on the Iris dataset.\\\",\\\"task_type\\\":\\\"eda\\\",\\\"code\\\":\\\"\\\",\\\"result\\\":\\\"\\\",\\\"is_success\\\":false,\\\"is_finished\\\":false}\\n\"}, {\"role\": \"user\", \"content\": \"\\n# Instruction\\nWrite complete code for 'Current Task'. And avoid duplicating code from finished tasks, such as repeated import of packages, reading data, etc.\\nSpecifically, \\nThe current task is about exploratory data analysis, please note the following:\\n- Distinguish column types with `select_dtypes` for tailored analysis and visualization, such as correlation.\\n- Remember to `import numpy as np` before using Numpy functions.\\n\\n\\n# Capabilities\\n- You can utilize pre-defined tools in any code lines from 'Available Tools' in the form of Python Class.\\n- You can freely combine the use of any other public packages, like sklearn, numpy, pandas, etc..\\n\\n# Available Tools (can be empty):\\nEach Class tool is described in JSON format. When you call a tool, import the tool first.\\n{}\\n\\n# Constraints:\\n- Ensure the output new code is executable in the same Jupyter notebook with previous tasks code have been executed.\\n- Always prioritize using pre-defined tools for the same functionality.\\n\"}]": {
+        "code": "import numpy as np\nimport pandas as pd\n\n# Convert the sklearn dataset to a pandas DataFrame for easier manipulation\nfeature_names = iris_data.feature_names\niris_df = pd.DataFrame(X, columns=feature_names)\n\n# Add target column to the DataFrame\niris_df['species'] = pd.Categorical.from_codes(iris_data.target, iris_data.target_names)\n\n# Display basic information about the dataset\niris_info = iris_df.info()\n\n# Display basic statistics about the dataset\niris_description = iris_df.describe()\n\n# Check for missing values\nmissing_values = iris_df.isnull().sum()\n\n# Distinguish column types\nnumerical_cols = iris_df.select_dtypes(include=[np.number]).columns.tolist()\ncategorical_cols = iris_df.select_dtypes(include=['category']).columns.tolist()\n\n# Display the first few rows of the DataFrame\nhead = iris_df.head()\n\n# Output the results\n(iris_info, iris_description, missing_values, numerical_cols, categorical_cols, head)"
    },
-    "[{\"role\": \"user\", \"content\": \"\\n# Background\\nAs a data scientist, you need to help user to achieve their goal [This is a titanic passenger survival dataset, your goal is to predict passenger survival outcome. The target column is Survived. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report accuracy on the eval data. Train data path: 'tests/data/ml_datasets/titanic/split_train.csv', eval data path: 'tests/data/ml_datasets/titanic/split_eval.csv'.] step-by-step in an continuous Jupyter notebook.\\n\\n## Done Tasks\\n```python\\n\\n```end\\n\\n## Current Task\\nPerform exploratory data analysis on the train dataset to understand the features and target variable.\\n\\n# Latest Data Info\\nLatest data info after previous tasks:\\n\\n\\n# Task\\nWrite complete code for 'Current Task'. And avoid duplicating code from 'Done Tasks', such as repeated import of packages, reading data, etc.\\nSpecifically, \\n\\n# Output Example:\\nwhen current task is \\\"train a lightgbm model on training data\\\", the code can be like:\\n```python\\n# Step 1: check data type and convert to numeric\\nobj_cols = train.select_dtypes(include='object').columns.tolist()\\n\\nfor col in obj_cols:\\n    encoder = LabelEncoder()\\n    train[col] = encoder.fit_transform(train[col].unique().tolist() + ['unknown'])\\n    test[col] = test[col].apply(lambda x: x if x in encoder.classes_ else 'unknown')\\n    test[col] = encoder.transform(test[col])\\n\\n# Step 2: train lightgbm model\\nmodel = LGBMClassifier()\\nmodel.fit(train, y_train)\\n```end\\n\\n# Constraints:\\n- Ensure the output new code is executable in the same Jupyter notebook with previous tasks code have been executed.\\n\"}]": {
-        "code": "# Perform exploratory data analysis on the train dataset\ndf_train = pd.read_csv('tests/data/ml_datasets/titanic/split_train.csv')\n\n# Display the first few rows of the dataset\ndisplay(df_train.head())\n\n# Summary statistics for numerical features\ndisplay(df_train.describe())\n\n# Summary information about the dataset including the data types and number of non-null values\ndisplay(df_train.info())\n\n# Distribution of the target variable 'Survived'\nsurvival_counts = df_train['Survived'].value_counts()\nprint(\"Survival counts:\\n\", survival_counts)\n\n# Visualizations\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n# Distribution of the target variable\nsns.countplot(x='Survived', data=df_train)\nplt.title('Distribution of Survival')\nplt.show()\n\n# Correlation matrix heatmap to understand the relationship between features\nplt.figure(figsize=(10, 8))\nsns.heatmap(df_train.corr(), annot=True, fmt='.2f')\nplt.title('Correlation Matrix')\nplt.show()\n\n# Pairplot to visualize the pairwise relationships between features\nsns.pairplot(df_train, hue='Survived')\nplt.title('Pairplot of Features')\nplt.show()"
+    "[{\"role\": \"system\", \"content\": \"You are Code Interpreter, a world-class programmer that can complete any goal by executing code. Strictly follow the plan and generate code step by step. Each step of the code will be executed on the user's machine, and the user will provide the code execution results to you.**Notice: The code for the next step depends on the code for the previous step. Must reuse variables in the lastest other code directly, dont creat it again, it is very import for you. Use !pip install in a standalone block to install missing packages.Usually the libraries you need are already installed.Dont check if packages already imported.**\"}, {\"role\": \"user\", \"content\": \"\\n## User Requirement\\nRun data analysis on sklearn Iris dataset, include a plot\\n## Context\\n\\n## Current Plan\\n[\\n    {\\n        \\\"task_id\\\": \\\"1\\\",\\n        \\\"dependent_task_ids\\\": [],\\n        \\\"instruction\\\": \\\"Load the sklearn Iris dataset.\\\",\\n        \\\"task_type\\\": \\\"data_preprocess\\\",\\n        \\\"code\\\": \\\"from sklearn.datasets import load_iris\\\\niris_data = load_iris()\\\\nX, y = iris_data.data, iris_data.target\\\",\\n        \\\"result\\\": \\\"a successful run\\\",\\n        \\\"is_success\\\": true,\\n        \\\"is_finished\\\": true\\n    },\\n    {\\n        \\\"task_id\\\": \\\"2\\\",\\n        \\\"dependent_task_ids\\\": [\\n            \\\"1\\\"\\n        ],\\n        \\\"instruction\\\": \\\"Perform exploratory data analysis on the Iris dataset.\\\",\\n        \\\"task_type\\\": \\\"eda\\\",\\n        \\\"code\\\": \\\"import numpy as np\\\\nimport pandas as pd\\\\n\\\\n# Convert the sklearn dataset to a pandas DataFrame for easier manipulation\\\\nfeature_names = iris_data.feature_names\\\\niris_df = pd.DataFrame(X, columns=feature_names)\\\\n\\\\n# Add target column to the DataFrame\\\\niris_df['species'] = pd.Categorical.from_codes(iris_data.target, iris_data.target_names)\\\\n\\\\n# Display basic 
information about the dataset\\\\niris_info = iris_df.info()\\\\n\\\\n# Display basic statistics about the dataset\\\\niris_description = iris_df.describe()\\\\n\\\\n# Check for missing values\\\\nmissing_values = iris_df.isnull().sum()\\\\n\\\\n# Distinguish column types\\\\nnumerical_cols = iris_df.select_dtypes(include=[np.number]).columns.tolist()\\\\ncategorical_cols = iris_df.select_dtypes(include=['category']).columns.tolist()\\\\n\\\\n# Display the first few rows of the DataFrame\\\\nhead = iris_df.head()\\\\n\\\\n# Output the results\\\\n(iris_info, iris_description, missing_values, numerical_cols, categorical_cols, head)\\\",\\n        \\\"result\\\": \\\"a successful run\\\",\\n        \\\"is_success\\\": true,\\n        \\\"is_finished\\\": true\\n    },\\n    {\\n        \\\"task_id\\\": \\\"3\\\",\\n        \\\"dependent_task_ids\\\": [\\n            \\\"2\\\"\\n        ],\\n        \\\"instruction\\\": \\\"Create a plot visualizing the Iris dataset.\\\",\\n        \\\"task_type\\\": \\\"other\\\",\\n        \\\"code\\\": \\\"\\\",\\n        \\\"result\\\": \\\"\\\",\\n        \\\"is_success\\\": false,\\n        \\\"is_finished\\\": false\\n    }\\n]\\n## Current Task\\n{\\\"task_id\\\":\\\"3\\\",\\\"dependent_task_ids\\\":[\\\"2\\\"],\\\"instruction\\\":\\\"Create a plot visualizing the Iris dataset.\\\",\\\"task_type\\\":\\\"other\\\",\\\"code\\\":\\\"\\\",\\\"result\\\":\\\"\\\",\\\"is_success\\\":false,\\\"is_finished\\\":false}\\n\"}, {\"role\": \"user\", \"content\": \"\\n# Instruction\\nWrite complete code for 'Current Task'. 
And avoid duplicating code from finished tasks, such as repeated import of packages, reading data, etc.\\nSpecifically, \\n\\n# Capabilities\\n- You can utilize pre-defined tools in any code lines from 'Available Tools' in the form of Python Class.\\n- You can freely combine the use of any other public packages, like sklearn, numpy, pandas, etc..\\n\\n# Available Tools (can be empty):\\nEach Class tool is described in JSON format. When you call a tool, import the tool first.\\n{}\\n\\n# Constraints:\\n- Ensure the output new code is executable in the same Jupyter notebook with previous tasks code have been executed.\\n- Always prioritize using pre-defined tools for the same functionality.\\n\"}]": {
+        "code": "import matplotlib.pyplot as plt\nimport seaborn as sns\n\n# Set the style of seaborn\nsns.set(style='whitegrid')\n\n# Create a pairplot to visualize the relationships between the features\npairplot = sns.pairplot(iris_df, hue='species')\nplt.show()"
+    },
+    "[{\"role\": \"user\", \"content\": \"\\n# Background\\nAs a data scientist, you need to help user to achieve their goal [This is a titanic passenger survival dataset, your goal is to predict passenger survival outcome. The target column is Survived. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report accuracy on the eval data. Train data path: 'tests/data/ml_datasets/titanic/split_train.csv', eval data path: 'tests/data/ml_datasets/titanic/split_eval.csv'.] step-by-step in an continuous Jupyter notebook.\\n\\n## Done Tasks\\n```python\\n\\n```end\\n\\n## Current Task\\nPerform exploratory data analysis on the train dataset to understand the features and target variable.\\n\\n# Latest Data Info\\nLatest data info after previous tasks:\\n\\n\\n# Task\\nWrite complete code for 'Current Task'. And avoid duplicating code from 'Done Tasks', such as repeated import of packages, reading data, etc.\\nSpecifically, \\nThe current task is about exploratory data analysis, please note the following:\\n- Distinguish column types with `select_dtypes` for tailored analysis and visualization, such as correlation.\\n- Remember to `import numpy as np` before using Numpy functions.\\n\\n\\n# Output Example:\\nwhen current task is \\\"train a lightgbm model on training data\\\", the code can be like:\\n```python\\n# Step 1: check data type and convert to numeric\\nobj_cols = train.select_dtypes(include='object').columns.tolist()\\n\\nfor col in obj_cols:\\n    encoder = LabelEncoder()\\n    train[col] = encoder.fit_transform(train[col].unique().tolist() + ['unknown'])\\n    test[col] = test[col].apply(lambda x: x if x in encoder.classes_ else 'unknown')\\n    test[col] = encoder.transform(test[col])\\n\\n# Step 2: train lightgbm model\\nmodel = LGBMClassifier()\\nmodel.fit(train, y_train)\\n```end\\n\\n# Constraints:\\n- Ensure the output new code is executable in the same Jupyter notebook with previous tasks code have been 
executed.\\n\"}]": {
+        "code": "# Perform exploratory data analysis on the train dataset\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n# Read the train dataset\ntrain_data = pd.read_csv('tests/data/ml_datasets/titanic/split_train.csv')\n\n# Display the first few rows of the dataset\ndisplay(train_data.head())\n\n# Summary statistics for numerical features\nprint(train_data.describe())\n\n# Summary statistics for categorical features\nprint(train_data.describe(include=['O']))\n\n# Check for missing values\nprint(train_data.isnull().sum())\n\n# Distribution of the target variable\nsns.countplot(x='Survived', data=train_data)\nplt.title('Distribution of Survival on the Titanic')\nplt.show()\n\n# Correlation matrix for numerical features\nnumerical_features = train_data.select_dtypes(include=[np.number])\ncorrelation_matrix = numerical_features.corr()\nplt.figure(figsize=(10, 8))\nsns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)\nplt.title('Correlation Matrix for Numerical Features')\nplt.show()\n\n# Pairplot for selected features\nselected_features = ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']\nsns.pairplot(train_data[selected_features], hue='Survived')\nplt.show()\n\n# Boxplot for categorical features vs Survived\nfor column in train_data.select_dtypes(include=['O']).columns:\n    if column != 'Survived':\n        plt.figure(figsize=(10, 5))\n        sns.boxplot(x='Survived', y=column, data=train_data)\n        plt.title(f'Survived vs {column}')\n        plt.show()\n"
    },
    "[{\"role\": \"system\", \"content\": \"You are an AI Python assistant. You will be given your previous implementation code of a task, runtime error results, and a hint to change the implementation appropriately. Write your full implementation \"}, {\"role\": \"user\", \"content\": \"\\nHere is an example for you.\\n\\nExample 1:\\n[previous impl]:\\n```python\\ndef add(a: int, b: int) -> int:\\n   \\\"\\\"\\\"\\n   Given integers a and b, return the total value of a and b.\\n   \\\"\\\"\\\"\\n   return a - b\\n```\\n\\n[runtime Error]:\\nTested passed:\\n\\nTests failed:\\nassert add(1, 2) == 3 # output: -1\\nassert add(1, 2) == 4 # output: -1\\n\\n[reflection on previous impl]:\\nThe implementation failed the test cases where the input integers are 1 and 2. The issue arises because the code does not add the two integers together, but instead subtracts the second integer from the first. To fix this issue, we should change the operator from `-` to `+` in the return statement. This will ensure that the function returns the correct output for the given input.\\n\\n[improved impl]:\\n```python\\ndef add(a: int, b: int) -> int:\\n   \\\"\\\"\\\"\\n   Given integers a and b, return the total value of a and b.\\n   \\\"\\\"\\\"\\n   return a + b\\n```\\n\\n[context]\\nSolve the problem in Python:\\ndef sort_array(arr):\\n    \\\"\\\"\\\"\\n    In this Kata, you have to sort an array of non-negative integers according to\\n    number of ones in their binary representation in ascending order.\\n    For similar number of ones, sort based on decimal value.\\n\\n    It must be implemented like this:\\n    >>> sort_array([1, 5, 2, 3, 4]) == [1, 2, 3, 4, 5]\\n    >>> sort_array([-2, -3, -4, -5, -6]) == [-6, -5, -4, -3, -2]\\n    >>> sort_array([1, 0, 2, 3, 4]) [0, 1, 2, 3, 4]\\n    \\\"\\\"\\\"\\n\\n\\n[previous impl]\\n\\ndef sort_array(arr):\\n    # Helper function to count the number of ones in the binary representation\\n    def count_ones(n):\\n        return 
bin(n).count('1')\\n    \\n    # Sort the array using a custom key function\\n    # The key function returns a tuple (number of ones, value) for each element\\n    # This ensures that if two elements have the same number of ones, they are sorted by their value\\n    sorted_arr = sorted(arr, key=lambda x: (count_ones(x), x))\\n    \\n    return sorted_arr\\n```\\n\\n[runtime Error]\\n[user: Tested passed:\\n\\nTests failed:\\nassert sort_array([1, 5, 2, 3, 4]) == [1, 2, 3, 4, 5] # output: [1, 2, 4, 3, 5]\\n]\\n\\nAnalysis the error step by step, provide me improve method and code. Remember to follow [context] requirement. Don't forget write code for steps behind the error step.\\n[reflection on previous impl]:\\nxxx\\n\"}]": {
        "reflection": "The implementation failed the test case where the input array is [1, 5, 2, 3, 4]. The issue arises because the code does not handle the sorting of elements with the same number of ones in their binary representation correctly. According to the context, for similar number of ones, the elements should be sorted based on their decimal value in ascending order. However, the output [1, 2, 4, 3, 5] shows that 3 and 4 are not in the correct order, indicating that the sorting by decimal value is not functioning as intended. To fix this issue, we should ensure that the sorting is stable, so that when two elements have the same number of ones, they maintain their relative order if their decimal values are already in ascending order.",
--- a/tests/metagpt/actions/mi/test_ask_review.py
+++ b/tests/metagpt/actions/mi/test_ask_review.py
@ -1,6 +1,6 @@
 import pytest

-from metagpt.actions.ci.ask_review import AskReview
+from metagpt.actions.mi.ask_review import AskReview


@pytest.mark.asyncio
--- a/tests/metagpt/actions/mi/test_debug_code.py
+++ b/tests/metagpt/actions/mi/test_debug_code.py
@ -5,7 +5,7 @@

 import pytest

-from metagpt.actions.ci.debug_code import DebugCode
+from metagpt.actions.mi.debug_code import DebugCode
 from metagpt.schema import Message

 ErrorStr = """Tested passed:
--- a/tests/metagpt/actions/mi/test_execute_nb_code.py
+++ b/tests/metagpt/actions/mi/test_execute_nb_code.py
@ -1,6 +1,6 @@
 import pytest

-from metagpt.actions.ci.execute_nb_code import ExecuteNbCode, truncate
+from metagpt.actions.mi.execute_nb_code import ExecuteNbCode, truncate


@pytest.mark.asyncio
--- a/tests/metagpt/actions/mi/test_ml_action.py
+++ b/tests/metagpt/actions/mi/test_ml_action.py
@ -1,6 +1,6 @@
 import pytest

-from metagpt.actions.ci.ml_action import WriteCodeWithToolsML
+from metagpt.actions.mi.ml_action import WriteCodeWithToolsML
 from metagpt.schema import Plan, Task


--- a/tests/metagpt/actions/mi/test_write_analysis_code.py
+++ b/tests/metagpt/actions/mi/test_write_analysis_code.py
@ -2,8 +2,8 @@ import asyncio

 import pytest

-from metagpt.actions.ci.execute_nb_code import ExecuteNbCode
-from metagpt.actions.ci.write_analysis_code import (
+from metagpt.actions.mi.execute_nb_code import ExecuteNbCode
+from metagpt.actions.mi.write_analysis_code import (
    WriteCodeWithoutTools,
    WriteCodeWithTools,
 )
--- a/tests/metagpt/actions/mi/test_write_plan.py
+++ b/tests/metagpt/actions/mi/test_write_plan.py
@ -1,6 +1,6 @@
 import pytest

-from metagpt.actions.ci.write_plan import (
+from metagpt.actions.mi.write_plan import (
    Plan,
    Task,
    WritePlan,
--- a/tests/metagpt/actions/test_rebuild_class_view.py
+++ b/tests/metagpt/actions/test_rebuild_class_view.py
@ -14,6 +14,7 @@ from metagpt.actions.rebuild_class_view import RebuildClassView
 from metagpt.llm import LLM


+@pytest.mark.skip
@pytest.mark.asyncio
 async def test_rebuild(context):
    action = RebuildClassView(
--- a/tests/metagpt/document_store/test_faiss_store.py
+++ b/tests/metagpt/document_store/test_faiss_store.py
@ -6,6 +6,9 @@
@File    : test_faiss_store.py
 """

+from typing import Optional
+
+import numpy as np
 import pytest

 from metagpt.const import EXAMPLE_PATH
@ -14,8 +17,17 @@ from metagpt.logs import logger
 from metagpt.roles import Sales


+def mock_openai_embed_documents(self, texts: list[str], chunk_size: Optional[int] = 0) -> list[list[float]]:
+    """Stand-in for `OpenAIEmbeddings.embed_documents` that needs no network access or API key.
+
+    Args:
+        self: the patched embeddings instance (unused).
+        texts: documents to embed; one row is produced per text.
+        chunk_size: accepted for signature compatibility and ignored.
+
+    Returns:
+        A `(len(texts), 1536)` array of random values, standardised per column.
+        NOTE: this is a numpy array rather than the annotated `list[list[float]]`.
+    """
+    num = len(texts)
+    embeds = np.random.randint(1, 100, size=(num, 1536))  # 1536: openai embedding dim
+    std = embeds.std(axis=0)
+    # A column whose values are all equal (always the case for a single text) has zero spread;
+    # dividing by it would fill the embedding with NaN, so such columns are left at zero instead.
+    std[std == 0] = 1.0
+    embeds = (embeds - embeds.mean(axis=0)) / std
+    return embeds
+
+
@pytest.mark.asyncio
-async def test_search_json():
+async def test_search_json(mocker):
+    mocker.patch("langchain_community.embeddings.openai.OpenAIEmbeddings.embed_documents", mock_openai_embed_documents)
+
    store = FaissStore(EXAMPLE_PATH / "example.json")
    role = Sales(profile="Sales", store=store)
    query = "Which facial cleanser is good for oily skin?"
@ -24,7 +36,9 @@ async def test_search_json():


@pytest.mark.asyncio
-async def test_search_xlsx():
+async def test_search_xlsx(mocker):
+    mocker.patch("langchain_community.embeddings.openai.OpenAIEmbeddings.embed_documents", mock_openai_embed_documents)
+
    store = FaissStore(EXAMPLE_PATH / "example.xlsx")
    role = Sales(profile="Sales", store=store)
    query = "Which facial cleanser is good for oily skin?"
@ -33,7 +47,9 @@ async def test_search_xlsx():


@pytest.mark.asyncio
-async def test_write():
+async def test_write(mocker):
+    mocker.patch("langchain_community.embeddings.openai.OpenAIEmbeddings.embed_documents", mock_openai_embed_documents)
+
    store = FaissStore(EXAMPLE_PATH / "example.xlsx", meta_col="Answer", content_col="Question")
    _faiss_store = store.write()
    assert _faiss_store.docstore
--- a/tests/metagpt/memory/mock_text_embed.py
+++ b/tests/metagpt/memory/mock_text_embed.py
@ -0,0 +1,33 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Desc   :
+
+from typing import Optional
+
+import numpy as np
+
+dim = 1536  # openai embedding dim
+
+# Fixed text -> embedding table used in place of real OpenAI embeddings.
+# Every entry carries either an all-zeros or an all-ones vector of shape (1, dim). The memory tests
+# fetch entries by position (text_embed_arr[0], [1], ...), so keep the order stable when adding entries.
+# NOTE(review): presumably texts sharing a vector are meant to count as "similar" and zeros-vs-ones as
+# "dissimilar" -- confirm against the memory storage search logic.
+text_embed_arr = [
+    {"text": "Write a cli snake game", "embed": np.zeros(shape=[1, dim])},  # mock data, same as below
+    {"text": "Write a game of cli snake", "embed": np.zeros(shape=[1, dim])},
+    {"text": "Write a 2048 web game", "embed": np.ones(shape=[1, dim])},
+    {"text": "Write a Battle City", "embed": np.ones(shape=[1, dim])},
+    {
+        "text": "The user has requested the creation of a command-line interface (CLI) snake game",
+        "embed": np.zeros(shape=[1, dim]),
+    },
+    {"text": "The request is command-line interface (CLI) snake game", "embed": np.zeros(shape=[1, dim])},
+    {
+        "text": "Incorporate basic features of a snake game such as scoring and increasing difficulty",
+        "embed": np.ones(shape=[1, dim]),
+    },
+]
+
+# Reverse lookup: text -> index of its entry in text_embed_arr.
+text_idx_dict = {item["text"]: idx for idx, item in enumerate(text_embed_arr)}
+
+
+def mock_openai_embed_documents(self, texts: list[str], chunk_size: Optional[int] = 0) -> list[list[float]]:
+    """Stand-in for `OpenAIEmbeddings.embed_documents` backed by the fixed `text_embed_arr` table.
+
+    Only the first element of `texts` is looked up; any further elements are ignored.
+
+    Args:
+        self: the patched embeddings instance (unused).
+        texts: texts to embed; `texts[0]` must be one of the texts listed in `text_embed_arr`.
+        chunk_size: accepted for signature compatibility and ignored.
+
+    Returns:
+        The `(1, dim)` numpy array stored for `texts[0]` (the table's own array, not a copy).
+
+    Raises:
+        IndexError: if `texts` is empty.
+        KeyError: if `texts[0]` has no entry in `text_embed_arr`.
+    """
+    text = texts[0]
+    idx = text_idx_dict.get(text)
+    if idx is None:
+        # Without this guard the lookup below fails with an opaque
+        # "list indices must be integers" TypeError that hides the real cause.
+        raise KeyError(f"no mock embedding registered for text: {text!r}")
+    return text_embed_arr[idx]["embed"]
--- a/tests/metagpt/memory/test_longterm_memory.py
+++ b/tests/metagpt/memory/test_longterm_memory.py
@ -4,20 +4,22 @@
@Desc   : unittest of `metagpt/memory/longterm_memory.py`
 """

-import os

 import pytest

 from metagpt.actions import UserRequirement
-from metagpt.config2 import config
 from metagpt.memory.longterm_memory import LongTermMemory
 from metagpt.roles.role import RoleContext
 from metagpt.schema import Message
-
-os.environ.setdefault("OPENAI_API_KEY", config.get_openai_llm().api_key)
+from tests.metagpt.memory.mock_text_embed import (
+    mock_openai_embed_documents,
+    text_embed_arr,
+)


-def test_ltm_search():
+def test_ltm_search(mocker):
+    mocker.patch("langchain_community.embeddings.openai.OpenAIEmbeddings.embed_documents", mock_openai_embed_documents)
+
    role_id = "UTUserLtm(Product Manager)"
    from metagpt.environment import Environment

@ -27,20 +29,20 @@ def test_ltm_search():
    ltm = LongTermMemory()
    ltm.recover_memory(role_id, rc)

-    idea = "Write a cli snake game"
+    idea = text_embed_arr[0].get("text", "Write a cli snake game")
    message = Message(role="User", content=idea, cause_by=UserRequirement)
    news = ltm.find_news([message])
    assert len(news) == 1
    ltm.add(message)

-    sim_idea = "Write a game of cli snake"
+    sim_idea = text_embed_arr[1].get("text", "Write a game of cli snake")

    sim_message = Message(role="User", content=sim_idea, cause_by=UserRequirement)
    news = ltm.find_news([sim_message])
    assert len(news) == 0
    ltm.add(sim_message)

-    new_idea = "Write a 2048 web game"
+    new_idea = text_embed_arr[2].get("text", "Write a 2048 web game")
    new_message = Message(role="User", content=new_idea, cause_by=UserRequirement)
    news = ltm.find_news([new_message])
    assert len(news) == 1
@ -56,7 +58,7 @@ def test_ltm_search():
    news = ltm_new.find_news([sim_message])
    assert len(news) == 0

-    new_idea = "Write a Battle City"
+    new_idea = text_embed_arr[3].get("text", "Write a Battle City")
    new_message = Message(role="User", content=new_idea, cause_by=UserRequirement)
    news = ltm_new.find_news([new_message])
    assert len(news) == 1
--- a/tests/metagpt/memory/test_memory_storage.py
+++ b/tests/metagpt/memory/test_memory_storage.py
@ -4,23 +4,25 @@
@Desc   : the unittests of metagpt/memory/memory_storage.py
 """

-import os
 import shutil
 from pathlib import Path
 from typing import List

 from metagpt.actions import UserRequirement, WritePRD
 from metagpt.actions.action_node import ActionNode
-from metagpt.config2 import config
 from metagpt.const import DATA_PATH
 from metagpt.memory.memory_storage import MemoryStorage
 from metagpt.schema import Message
-
-os.environ.setdefault("OPENAI_API_KEY", config.get_openai_llm().api_key)
+from tests.metagpt.memory.mock_text_embed import (
+    mock_openai_embed_documents,
+    text_embed_arr,
+)


-def test_idea_message():
-    idea = "Write a cli snake game"
+def test_idea_message(mocker):
+    mocker.patch("langchain_community.embeddings.openai.OpenAIEmbeddings.embed_documents", mock_openai_embed_documents)
+
+    idea = text_embed_arr[0].get("text", "Write a cli snake game")
    role_id = "UTUser1(Product Manager)"
    message = Message(role="User", content=idea, cause_by=UserRequirement)

@ -33,12 +35,12 @@ def test_idea_message():
    memory_storage.add(message)
    assert memory_storage.is_initialized is True

-    sim_idea = "Write a game of cli snake"
+    sim_idea = text_embed_arr[1].get("text", "Write a game of cli snake")
    sim_message = Message(role="User", content=sim_idea, cause_by=UserRequirement)
    new_messages = memory_storage.search_dissimilar(sim_message)
    assert len(new_messages) == 0  # similar, return []

-    new_idea = "Write a 2048 web game"
+    new_idea = text_embed_arr[2].get("text", "Write a 2048 web game")
    new_message = Message(role="User", content=new_idea, cause_by=UserRequirement)
    new_messages = memory_storage.search_dissimilar(new_message)
    assert new_messages[0].content == message.content
@ -47,13 +49,17 @@ def test_idea_message():
    assert memory_storage.is_initialized is False


-def test_actionout_message():
+def test_actionout_message(mocker):
+    mocker.patch("langchain_community.embeddings.openai.OpenAIEmbeddings.embed_documents", mock_openai_embed_documents)
+
    out_mapping = {"field1": (str, ...), "field2": (List[str], ...)}
    out_data = {"field1": "field1 value", "field2": ["field2 value1", "field2 value2"]}
    ic_obj = ActionNode.create_model_class("prd", out_mapping)

    role_id = "UTUser2(Architect)"
-    content = "The user has requested the creation of a command-line interface (CLI) snake game"
+    content = text_embed_arr[4].get(
+        "text", "The user has requested the creation of a command-line interface (CLI) snake game"
+    )
    message = Message(
        content=content, instruct_content=ic_obj(**out_data), role="user", cause_by=WritePRD
    )  # WritePRD as test action
@ -67,12 +73,14 @@ def test_actionout_message():
    memory_storage.add(message)
    assert memory_storage.is_initialized is True

-    sim_conent = "The request is command-line interface (CLI) snake game"
+    sim_conent = text_embed_arr[5].get("text", "The request is command-line interface (CLI) snake game")
    sim_message = Message(content=sim_conent, instruct_content=ic_obj(**out_data), role="user", cause_by=WritePRD)
    new_messages = memory_storage.search_dissimilar(sim_message)
    assert len(new_messages) == 0  # similar, return []

-    new_conent = "Incorporate basic features of a snake game such as scoring and increasing difficulty"
+    new_conent = text_embed_arr[6].get(
+        "text", "Incorporate basic features of a snake game such as scoring and increasing difficulty"
+    )
    new_message = Message(content=new_conent, instruct_content=ic_obj(**out_data), role="user", cause_by=WritePRD)
    new_messages = memory_storage.search_dissimilar(new_message)
    assert new_messages[0].content == message.content
--- a/tests/metagpt/provider/mock_llm_config.py
+++ b/tests/metagpt/provider/mock_llm_config.py
@ -42,3 +42,17 @@ mock_llm_config_zhipu = LLMConfig(
    model="mock_zhipu_model",
    proxy="http://localhost:8080",
 )
+
+
+# Placeholder config for the "spark" provider; all credentials are dummy values.
+mock_llm_config_spark = LLMConfig(
+    api_type="spark",
+    app_id="xxx",
+    api_key="xxx",
+    api_secret="xxx",
+    domain="generalv2",
+    base_url="wss://spark-api.xf-yun.com/v3.1/chat",
+)
+
+# Placeholder config for the "qianfan" provider; all credentials are dummy values.
+mock_llm_config_qianfan = LLMConfig(
+    api_type="qianfan",
+    access_key="xxx",
+    secret_key="xxx",
+    model="ERNIE-Bot-turbo",
+)
+
+# Placeholder config for the "dashscope" provider; the api key is a dummy value.
+mock_llm_config_dashscope = LLMConfig(
+    api_type="dashscope",
+    api_key="xxx",
+    model="qwen-max",
+)
--- a/tests/metagpt/provider/req_resp_const.py
+++ b/tests/metagpt/provider/req_resp_const.py
@ -0,0 +1,145 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Desc   : default request & response data for provider unittest
+
+
+from dashscope.api_entities.dashscope_response import (
+    DashScopeAPIResponse,
+    GenerationOutput,
+    GenerationResponse,
+    GenerationUsage,
+)
+from openai.types.chat.chat_completion import (
+    ChatCompletion,
+    ChatCompletionMessage,
+    Choice,
+)
+from openai.types.chat.chat_completion_chunk import ChatCompletionChunk
+from openai.types.chat.chat_completion_chunk import Choice as AChoice
+from openai.types.chat.chat_completion_chunk import ChoiceDelta
+from openai.types.completion_usage import CompletionUsage
+from qianfan.resources.typing import QfResponse
+
+from metagpt.provider.base_llm import BaseLLM
+
+# Single-turn request shared by the provider tests: once as a raw prompt, once as a chat message list.
+prompt = "who are you?"
+messages = [{"role": "user", "content": prompt}]
+
+# Reply template; each provider test fills in a name to build the reply its mocked backend returns.
+resp_cont_tmpl = "I'm {name}"
+default_resp_cont = resp_cont_tmpl.format(name="GPT")  # the template rendered with name="GPT"
+
+
+# part of whole ChatCompletion of openai like structure
+def get_part_chat_completion(name: str) -> dict:
+    """Build the `choices` / `usage` subset of an OpenAI-style chat completion as a plain dict.
+
+    Args:
+        name: value substituted into `resp_cont_tmpl` to form the assistant's reply text.
+
+    Returns:
+        A dict with one finished assistant choice and fixed token usage figures.
+    """
+    reply = {"role": "assistant", "content": resp_cont_tmpl.format(name=name)}
+    only_choice = {"index": 0, "message": reply, "finish_reason": "stop"}
+    token_usage = {"completion_tokens": 22, "prompt_tokens": 19, "total_tokens": 41}
+    return {"choices": [only_choice], "usage": token_usage}
+
+
+def get_openai_chat_completion(name: str) -> ChatCompletion:
+    """Build a complete (non-streaming) OpenAI `ChatCompletion` holding one assistant reply.
+
+    Args:
+        name: value substituted into `resp_cont_tmpl` to form the reply text.
+
+    Returns:
+        A `ChatCompletion` with a single choice finished with "stop" and fixed id, model,
+        timestamp and token usage values.
+    """
+    reply = ChatCompletionMessage(role="assistant", content=resp_cont_tmpl.format(name=name))
+    only_choice = Choice(finish_reason="stop", index=0, message=reply, logprobs=None)
+    token_usage = CompletionUsage(completion_tokens=110, prompt_tokens=92, total_tokens=202)
+    return ChatCompletion(
+        id="cmpl-a6652c1bb181caae8dd19ad8",
+        model="xx/xxx",
+        object="chat.completion",
+        created=1703300855,
+        choices=[only_choice],
+        usage=token_usage,
+    )
+
+
+def get_openai_chat_completion_chunk(name: str, usage_as_dict: bool = False) -> ChatCompletionChunk:
+    """Build a single streaming `ChatCompletionChunk` that carries the whole assistant reply.
+
+    Args:
+        name: value substituted into `resp_cont_tmpl` to form the reply text.
+        usage_as_dict: when true, attach the token usage as a plain dict (`model_dump()`)
+            instead of a `CompletionUsage` object.
+
+    Returns:
+        A `ChatCompletionChunk` with one delta choice finished with "stop".
+    """
+    token_usage = CompletionUsage(completion_tokens=110, prompt_tokens=92, total_tokens=202)
+    if usage_as_dict:
+        token_usage = token_usage.model_dump()
+
+    delta = ChoiceDelta(role="assistant", content=resp_cont_tmpl.format(name=name))
+    only_choice = AChoice(delta=delta, finish_reason="stop", index=0, logprobs=None)
+    return ChatCompletionChunk(
+        id="cmpl-a6652c1bb181caae8dd19ad8",
+        model="xx/xxx",
+        object="chat.completion.chunk",
+        created=1703300855,
+        choices=[only_choice],
+        usage=token_usage,
+    )
+
+
+# For gemini: the same single-turn request, with the text carried under "parts" instead of "content"
+gemini_messages = [{"role": "user", "parts": prompt}]
+
+
+# For QianFan
+# Template for the JSON body of a QianFan reply; "result" is filled in per response and the
+# template itself is never modified.
+qf_jsonbody_dict = {
+    "id": "as-4v1h587fyv",
+    "object": "chat.completion",
+    "created": 1695021339,
+    "result": "",
+    "is_truncated": False,
+    "need_clear_history": False,
+    "usage": {"prompt_tokens": 7, "completion_tokens": 15, "total_tokens": 22},
+}
+
+
+def get_qianfan_response(name: str) -> QfResponse:
+    """Build a successful (code 200) QianFan response whose result text mentions `name`.
+
+    Args:
+        name: value substituted into `resp_cont_tmpl` to form the "result" text.
+
+    Returns:
+        A `QfResponse` whose body is an independent copy of `qf_jsonbody_dict` with "result" set.
+    """
+    import copy
+
+    # Work on a private copy. Writing into the shared template would leak state between tests, and
+    # every response built so far would be handed the very same dict, so a later call with a
+    # different name could silently change the text of earlier responses.
+    body = copy.deepcopy(qf_jsonbody_dict)
+    body["result"] = resp_cont_tmpl.format(name=name)
+    return QfResponse(code=200, body=body)
+
+
+# For DashScope
+def get_dashscope_response(name: str) -> GenerationResponse:
+    """Build a successful (status 200) DashScope generation response with one assistant reply.
+
+    Args:
+        name: value substituted into `resp_cont_tmpl` to form the reply text.
+
+    Returns:
+        The `GenerationResponse` derived from a `DashScopeAPIResponse` with fixed token usage.
+    """
+    reply = {"role": "assistant", "content": resp_cont_tmpl.format(name=name)}
+    output = GenerationOutput(
+        text="",
+        finish_reason="",
+        choices=[{"finish_reason": "stop", "message": reply}],
+    )
+    usage = GenerationUsage(input_tokens=12, output_tokens=98, total_tokens=110)
+    api_response = DashScopeAPIResponse(status_code=200, output=output, usage=usage)
+    return GenerationResponse.from_api_response(api_response)
+
+
+# For llm general chat functions call
+async def llm_general_chat_funcs_test(llm: BaseLLM, prompt: str, messages: list[dict], resp_cont: str):
+    """Check that the common chat entry points of `llm` all return `resp_cont`.
+
+    Calls, in order: `aask` with `stream=False`, `aask` with its default streaming setting,
+    `acompletion_text` with `stream=False`, and `acompletion_text` with `stream=True`.
+
+    Args:
+        llm: provider under test (its backend is expected to be mocked by the caller).
+        prompt: prompt passed to `aask`.
+        messages: chat messages passed to `acompletion_text`.
+        resp_cont: reply text every call must return.
+
+    Raises:
+        AssertionError: as soon as one call returns something other than `resp_cont`.
+    """
+    # Each entry is deferred so a call is only started once the previous one has been checked.
+    calls = (
+        lambda: llm.aask(prompt, stream=False),
+        lambda: llm.aask(prompt),
+        lambda: llm.acompletion_text(messages, stream=False),
+        lambda: llm.acompletion_text(messages, stream=True),
+    )
+    for make_call in calls:
+        resp = await make_call()
+        assert resp == resp_cont
--- a/tests/metagpt/provider/test_anthropic_api.py
+++ b/tests/metagpt/provider/test_anthropic_api.py
@ -8,25 +8,25 @@ from anthropic.resources.completions import Completion

 from metagpt.provider.anthropic_api import Claude2
 from tests.metagpt.provider.mock_llm_config import mock_llm_config
+from tests.metagpt.provider.req_resp_const import prompt, resp_cont_tmpl

-prompt = "who are you"
-resp = "I'am Claude2"
+resp_cont = resp_cont_tmpl.format(name="Claude")


 def mock_anthropic_completions_create(self, model: str, prompt: str, max_tokens_to_sample: int) -> Completion:
-    return Completion(id="xx", completion=resp, model="claude-2", stop_reason="stop_sequence", type="completion")
+    return Completion(id="xx", completion=resp_cont, model="claude-2", stop_reason="stop_sequence", type="completion")


 async def mock_anthropic_acompletions_create(self, model: str, prompt: str, max_tokens_to_sample: int) -> Completion:
-    return Completion(id="xx", completion=resp, model="claude-2", stop_reason="stop_sequence", type="completion")
+    return Completion(id="xx", completion=resp_cont, model="claude-2", stop_reason="stop_sequence", type="completion")


 def test_claude2_ask(mocker):
    mocker.patch("anthropic.resources.completions.Completions.create", mock_anthropic_completions_create)
-    assert resp == Claude2(mock_llm_config).ask(prompt)
+    assert resp_cont == Claude2(mock_llm_config).ask(prompt)


@pytest.mark.asyncio
 async def test_claude2_aask(mocker):
    mocker.patch("anthropic.resources.completions.AsyncCompletions.create", mock_anthropic_acompletions_create)
-    assert resp == await Claude2(mock_llm_config).aask(prompt)
+    assert resp_cont == await Claude2(mock_llm_config).aask(prompt)
--- a/tests/metagpt/provider/test_base_llm.py
+++ b/tests/metagpt/provider/test_base_llm.py
@ -11,21 +11,13 @@ import pytest
 from metagpt.configs.llm_config import LLMConfig
 from metagpt.provider.base_llm import BaseLLM
 from metagpt.schema import Message
+from tests.metagpt.provider.req_resp_const import (
+    default_resp_cont,
+    get_part_chat_completion,
+    prompt,
+)

-default_chat_resp = {
-    "choices": [
-        {
-            "index": 0,
-            "message": {
-                "role": "assistant",
-                "content": "I'am GPT",
-            },
-            "finish_reason": "stop",
-        }
-    ]
-}
-prompt_msg = "who are you"
-resp_content = default_chat_resp["choices"][0]["message"]["content"]
+name = "GPT"


 class MockBaseLLM(BaseLLM):
@ -33,16 +25,13 @@ class MockBaseLLM(BaseLLM):
        pass

    def completion(self, messages: list[dict], timeout=3):
-        return default_chat_resp
+        return get_part_chat_completion(name)

    async def acompletion(self, messages: list[dict], timeout=3):
-        return default_chat_resp
+        return get_part_chat_completion(name)

    async def acompletion_text(self, messages: list[dict], stream=False, timeout=3) -> str:
-        return resp_content
-
-    async def close(self):
-        return default_chat_resp
+        return default_resp_cont


 def test_base_llm():
@ -86,25 +75,25 @@ def test_base_llm():
    choice_text = base_llm.get_choice_text(openai_funccall_resp)
    assert choice_text == openai_funccall_resp["choices"][0]["message"]["content"]

-    # resp = base_llm.ask(prompt_msg)
-    # assert resp == resp_content
+    # resp = base_llm.ask(prompt)
+    # assert resp == default_resp_cont

-    # resp = base_llm.ask_batch([prompt_msg])
-    # assert resp == resp_content
+    # resp = base_llm.ask_batch([prompt])
+    # assert resp == default_resp_cont

-    # resp = base_llm.ask_code([prompt_msg])
-    # assert resp == resp_content
+    # resp = base_llm.ask_code([prompt])
+    # assert resp == default_resp_cont


@pytest.mark.asyncio
 async def test_async_base_llm():
    base_llm = MockBaseLLM()

-    resp = await base_llm.aask(prompt_msg)
-    assert resp == resp_content
+    resp = await base_llm.aask(prompt)
+    assert resp == default_resp_cont

-    resp = await base_llm.aask_batch([prompt_msg])
-    assert resp == resp_content
+    resp = await base_llm.aask_batch([prompt])
+    assert resp == default_resp_cont

-    # resp = await base_llm.aask_code([prompt_msg])
-    # assert resp == resp_content
+    # resp = await base_llm.aask_code([prompt])
+    # assert resp == default_resp_cont
--- a/tests/metagpt/provider/test_dashscope_api.py
+++ b/tests/metagpt/provider/test_dashscope_api.py
@ -0,0 +1,73 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Desc   : the unittest of DashScopeLLM
+
+from typing import AsyncGenerator, Union
+
+import pytest
+from dashscope.api_entities.dashscope_response import GenerationResponse
+
+from metagpt.provider.dashscope_api import DashScopeLLM
+from tests.metagpt.provider.mock_llm_config import mock_llm_config_dashscope
+from tests.metagpt.provider.req_resp_const import (
+    get_dashscope_response,
+    llm_general_chat_funcs_test,
+    messages,
+    prompt,
+    resp_cont_tmpl,
+)
+
+name = "qwen-max"
+resp_cont = resp_cont_tmpl.format(name=name)
+
+
+@classmethod
+def mock_dashscope_call(
+    cls,
+    messages: list[dict],
+    model: str,
+    api_key: str,
+    result_format: str,
+    incremental_output: bool = True,
+    stream: bool = False,
+) -> GenerationResponse:
+    """Synchronous stand-in patched over ``Generation.call`` in the test below.
+
+    All arguments are ignored; the canned response built by
+    ``get_dashscope_response`` for the module-level ``name`` is returned.
+    """
+    return get_dashscope_response(name)
+
+
+@classmethod
+async def mock_dashscope_acall(
+    cls,
+    messages: list[dict],
+    model: str,
+    api_key: str,
+    result_format: str,
+    incremental_output: bool = True,
+    stream: bool = False,
+) -> Union[AsyncGenerator[GenerationResponse, None], GenerationResponse]:
+    """Asynchronous stand-in patched over ``AGeneration.acall`` in the test below.
+
+    Returns the canned response for the module-level ``name`` directly, or,
+    when ``stream`` is truthy, an async generator that yields that one response.
+    """
+    canned = get_dashscope_response(name)
+    if not stream:
+        return canned
+
+    async def _single_item_stream():
+        # The mocked stream consists of exactly one response.
+        yield canned
+
+    return _single_item_stream()
+
+
+@pytest.mark.asyncio
+async def test_dashscope_acompletion(mocker):
+    """Check DashScopeLLM completion paths against the canned dashscope responses."""
+    # Replace both the sync and async dashscope entry points with the mocks above.
+    mocker.patch("dashscope.aigc.generation.Generation.call", mock_dashscope_call)
+    mocker.patch("metagpt.provider.dashscope_api.AGeneration.acall", mock_dashscope_acall)
+
+    dashscope_llm = DashScopeLLM(mock_llm_config_dashscope)
+
+    # Sync and async completion must both expose the canned content.
+    resp = dashscope_llm.completion(messages)
+    assert resp.choices[0]["message"]["content"] == resp_cont
+
+    resp = await dashscope_llm.acompletion(messages)
+    assert resp.choices[0]["message"]["content"] == resp_cont
+
+    # Shared chat-function checks from req_resp_const.
+    await llm_general_chat_funcs_test(dashscope_llm, prompt, messages, resp_cont)
--- a/tests/metagpt/provider/test_fireworks_llm.py
+++ b/tests/metagpt/provider/test_fireworks_llm.py
@ -1,114 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-# @Desc   : the unittest of fireworks api
-
-import pytest
-from openai.types.chat.chat_completion import (
-    ChatCompletion,
-    ChatCompletionMessage,
-    Choice,
-)
-from openai.types.chat.chat_completion_chunk import ChatCompletionChunk
-from openai.types.chat.chat_completion_chunk import Choice as AChoice
-from openai.types.chat.chat_completion_chunk import ChoiceDelta
-from openai.types.completion_usage import CompletionUsage
-
-from metagpt.provider.fireworks_api import (
-    MODEL_GRADE_TOKEN_COSTS,
-    FireworksCostManager,
-    FireworksLLM,
-)
-from metagpt.utils.cost_manager import Costs
-from tests.metagpt.provider.mock_llm_config import mock_llm_config
-
-resp_content = "I'm fireworks"
-default_resp = ChatCompletion(
-    id="cmpl-a6652c1bb181caae8dd19ad8",
-    model="accounts/fireworks/models/llama-v2-13b-chat",
-    object="chat.completion",
-    created=1703300855,
-    choices=[
-        Choice(
-            finish_reason="stop",
-            index=0,
-            message=ChatCompletionMessage(role="assistant", content=resp_content),
-            logprobs=None,
-        )
-    ],
-    usage=CompletionUsage(completion_tokens=110, prompt_tokens=92, total_tokens=202),
-)
-
-default_resp_chunk = ChatCompletionChunk(
-    id=default_resp.id,
-    model=default_resp.model,
-    object="chat.completion.chunk",
-    created=default_resp.created,
-    choices=[
-        AChoice(
-            delta=ChoiceDelta(content=resp_content, role="assistant"),
-            finish_reason="stop",
-            index=0,
-            logprobs=None,
-        )
-    ],
-    usage=dict(default_resp.usage),
-)
-
-prompt_msg = "who are you"
-messages = [{"role": "user", "content": prompt_msg}]
-
-
-def test_fireworks_costmanager():
-    cost_manager = FireworksCostManager()
-    assert MODEL_GRADE_TOKEN_COSTS["-1"] == cost_manager.model_grade_token_costs("test")
-    assert MODEL_GRADE_TOKEN_COSTS["-1"] == cost_manager.model_grade_token_costs("xxx-81b-chat")
-    assert MODEL_GRADE_TOKEN_COSTS["16"] == cost_manager.model_grade_token_costs("llama-v2-13b-chat")
-    assert MODEL_GRADE_TOKEN_COSTS["16"] == cost_manager.model_grade_token_costs("xxx-15.5b-chat")
-    assert MODEL_GRADE_TOKEN_COSTS["16"] == cost_manager.model_grade_token_costs("xxx-16b-chat")
-    assert MODEL_GRADE_TOKEN_COSTS["80"] == cost_manager.model_grade_token_costs("xxx-80b-chat")
-    assert MODEL_GRADE_TOKEN_COSTS["mixtral-8x7b"] == cost_manager.model_grade_token_costs("mixtral-8x7b-chat")
-
-    cost_manager.update_cost(prompt_tokens=500000, completion_tokens=500000, model="llama-v2-13b-chat")
-    assert cost_manager.total_cost == 0.5
-
-
-async def mock_openai_acompletions_create(self, stream: bool = False, **kwargs) -> ChatCompletionChunk:
-    if stream:
-
-        class Iterator(object):
-            async def __aiter__(self):
-                yield default_resp_chunk
-
-        return Iterator()
-    else:
-        return default_resp
-
-
-@pytest.mark.asyncio
-async def test_fireworks_acompletion(mocker):
-    mocker.patch("openai.resources.chat.completions.AsyncCompletions.create", mock_openai_acompletions_create)
-
-    fireworks_gpt = FireworksLLM(mock_llm_config)
-    fireworks_gpt.model = "llama-v2-13b-chat"
-
-    fireworks_gpt._update_costs(
-        usage=CompletionUsage(prompt_tokens=500000, completion_tokens=500000, total_tokens=1000000)
-    )
-    assert fireworks_gpt.get_costs() == Costs(
-        total_prompt_tokens=500000, total_completion_tokens=500000, total_cost=0.5, total_budget=0
-    )
-
-    resp = await fireworks_gpt.acompletion(messages)
-    assert resp.choices[0].message.content in resp_content
-
-    resp = await fireworks_gpt.aask(prompt_msg, stream=False)
-    assert resp == resp_content
-
-    resp = await fireworks_gpt.acompletion_text(messages, stream=False)
-    assert resp == resp_content
-
-    resp = await fireworks_gpt.acompletion_text(messages, stream=True)
-    assert resp == resp_content
-
-    resp = await fireworks_gpt.aask(prompt_msg)
-    assert resp == resp_content
--- a/tests/metagpt/provider/test_google_gemini_api.py
+++ b/tests/metagpt/provider/test_google_gemini_api.py
@ -11,6 +11,12 @@ from google.generativeai.types import content_types

 from metagpt.provider.google_gemini_api import GeminiLLM
 from tests.metagpt.provider.mock_llm_config import mock_llm_config
+from tests.metagpt.provider.req_resp_const import (
+    gemini_messages,
+    llm_general_chat_funcs_test,
+    prompt,
+    resp_cont_tmpl,
+)


@dataclass
@ -18,10 +24,8 @@ class MockGeminiResponse(ABC):
    text: str


-prompt_msg = "who are you"
-messages = [{"role": "user", "parts": prompt_msg}]
-resp_content = "I'm gemini from google"
-default_resp = MockGeminiResponse(text=resp_content)
+resp_cont = resp_cont_tmpl.format(name="gemini")
+default_resp = MockGeminiResponse(text=resp_cont)


 def mock_gemini_count_tokens(self, contents: content_types.ContentsType) -> glm.CountTokensResponse:
@ -60,28 +64,18 @@ async def test_gemini_acompletion(mocker):
        mock_gemini_generate_content_async,
    )

-    gemini_gpt = GeminiLLM(mock_llm_config)
+    gemini_llm = GeminiLLM(mock_llm_config)

-    assert gemini_gpt._user_msg(prompt_msg) == {"role": "user", "parts": [prompt_msg]}
-    assert gemini_gpt._assistant_msg(prompt_msg) == {"role": "model", "parts": [prompt_msg]}
+    assert gemini_llm._user_msg(prompt) == {"role": "user", "parts": [prompt]}
+    assert gemini_llm._assistant_msg(prompt) == {"role": "model", "parts": [prompt]}

-    usage = gemini_gpt.get_usage(messages, resp_content)
+    usage = gemini_llm.get_usage(gemini_messages, resp_cont)
    assert usage == {"prompt_tokens": 20, "completion_tokens": 20}

-    resp = gemini_gpt.completion(messages)
+    resp = gemini_llm.completion(gemini_messages)
    assert resp == default_resp

-    resp = await gemini_gpt.acompletion(messages)
+    resp = await gemini_llm.acompletion(gemini_messages)
    assert resp.text == default_resp.text

-    resp = await gemini_gpt.aask(prompt_msg, stream=False)
-    assert resp == resp_content
-
-    resp = await gemini_gpt.acompletion_text(messages, stream=False)
-    assert resp == resp_content
-
-    resp = await gemini_gpt.acompletion_text(messages, stream=True)
-    assert resp == resp_content
-
-    resp = await gemini_gpt.aask(prompt_msg)
-    assert resp == resp_content
+    await llm_general_chat_funcs_test(gemini_llm, prompt, gemini_messages, resp_cont)
--- a/tests/metagpt/provider/test_ollama_api.py
+++ b/tests/metagpt/provider/test_ollama_api.py
@ -9,12 +9,15 @@ import pytest

 from metagpt.provider.ollama_api import OllamaLLM
 from tests.metagpt.provider.mock_llm_config import mock_llm_config
+from tests.metagpt.provider.req_resp_const import (
+    llm_general_chat_funcs_test,
+    messages,
+    prompt,
+    resp_cont_tmpl,
+)

-prompt_msg = "who are you"
-messages = [{"role": "user", "content": prompt_msg}]
-
-resp_content = "I'm ollama"
-default_resp = {"message": {"role": "assistant", "content": resp_content}}
+resp_cont = resp_cont_tmpl.format(name="ollama")
+default_resp = {"message": {"role": "assistant", "content": resp_cont}}


 async def mock_ollama_arequest(self, stream: bool = False, **kwargs) -> Tuple[Any, Any, bool]:
@ -41,19 +44,12 @@ async def mock_ollama_arequest(self, stream: bool = False, **kwargs) -> Tuple[An
 async def test_gemini_acompletion(mocker):
    mocker.patch("metagpt.provider.general_api_requestor.GeneralAPIRequestor.arequest", mock_ollama_arequest)

-    ollama_gpt = OllamaLLM(mock_llm_config)
+    ollama_llm = OllamaLLM(mock_llm_config)

-    resp = await ollama_gpt.acompletion(messages)
+    resp = await ollama_llm.acompletion(messages)
    assert resp["message"]["content"] == default_resp["message"]["content"]

-    resp = await ollama_gpt.aask(prompt_msg, stream=False)
-    assert resp == resp_content
+    resp = await ollama_llm.aask(prompt, stream=False)
+    assert resp == resp_cont

-    resp = await ollama_gpt.acompletion_text(messages, stream=False)
-    assert resp == resp_content
-
-    resp = await ollama_gpt.acompletion_text(messages, stream=True)
-    assert resp == resp_content
-
-    resp = await ollama_gpt.aask(prompt_msg)
-    assert resp == resp_content
+    await llm_general_chat_funcs_test(ollama_llm, prompt, messages, resp_cont)
--- a/tests/metagpt/provider/test_open_llm_api.py
+++ b/tests/metagpt/provider/test_open_llm_api.py
@ -1,92 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-# @Desc   :
-
-import pytest
-from openai.types.chat.chat_completion import (
-    ChatCompletion,
-    ChatCompletionMessage,
-    Choice,
-)
-from openai.types.chat.chat_completion_chunk import ChatCompletionChunk
-from openai.types.chat.chat_completion_chunk import Choice as AChoice
-from openai.types.chat.chat_completion_chunk import ChoiceDelta
-from openai.types.completion_usage import CompletionUsage
-
-from metagpt.provider.open_llm_api import OpenLLM
-from metagpt.utils.cost_manager import Costs
-from tests.metagpt.provider.mock_llm_config import mock_llm_config
-
-resp_content = "I'm llama2"
-default_resp = ChatCompletion(
-    id="cmpl-a6652c1bb181caae8dd19ad8",
-    model="llama-v2-13b-chat",
-    object="chat.completion",
-    created=1703302755,
-    choices=[
-        Choice(
-            finish_reason="stop",
-            index=0,
-            message=ChatCompletionMessage(role="assistant", content=resp_content),
-            logprobs=None,
-        )
-    ],
-)
-
-default_resp_chunk = ChatCompletionChunk(
-    id=default_resp.id,
-    model=default_resp.model,
-    object="chat.completion.chunk",
-    created=default_resp.created,
-    choices=[
-        AChoice(
-            delta=ChoiceDelta(content=resp_content, role="assistant"),
-            finish_reason="stop",
-            index=0,
-            logprobs=None,
-        )
-    ],
-)
-
-prompt_msg = "who are you"
-messages = [{"role": "user", "content": prompt_msg}]
-
-
-async def mock_openai_acompletions_create(self, stream: bool = False, **kwargs) -> ChatCompletionChunk:
-    if stream:
-
-        class Iterator(object):
-            async def __aiter__(self):
-                yield default_resp_chunk
-
-        return Iterator()
-    else:
-        return default_resp
-
-
-@pytest.mark.asyncio
-async def test_openllm_acompletion(mocker):
-    mocker.patch("openai.resources.chat.completions.AsyncCompletions.create", mock_openai_acompletions_create)
-
-    openllm_gpt = OpenLLM(mock_llm_config)
-    openllm_gpt.model = "llama-v2-13b-chat"
-
-    openllm_gpt._update_costs(usage=CompletionUsage(prompt_tokens=100, completion_tokens=100, total_tokens=200))
-    assert openllm_gpt.get_costs() == Costs(
-        total_prompt_tokens=100, total_completion_tokens=100, total_cost=0, total_budget=0
-    )
-
-    resp = await openllm_gpt.acompletion(messages)
-    assert resp.choices[0].message.content in resp_content
-
-    resp = await openllm_gpt.aask(prompt_msg, stream=False)
-    assert resp == resp_content
-
-    resp = await openllm_gpt.acompletion_text(messages, stream=False)
-    assert resp == resp_content
-
-    resp = await openllm_gpt.acompletion_text(messages, stream=True)
-    assert resp == resp_content
-
-    resp = await openllm_gpt.aask(prompt_msg)
-    assert resp == resp_content
--- a/tests/metagpt/provider/test_openai.py
+++ b/tests/metagpt/provider/test_openai.py
@ -1,12 +1,11 @@
-import json
-
 import pytest
 from openai.types.chat import (
    ChatCompletion,
+    ChatCompletionChunk,
    ChatCompletionMessage,
    ChatCompletionMessageToolCall,
 )
-from openai.types.chat.chat_completion import Choice
+from openai.types.chat.chat_completion import Choice, CompletionUsage
 from openai.types.chat.chat_completion_message_tool_call import Function
 from PIL import Image

@ -18,6 +17,22 @@ from tests.metagpt.provider.mock_llm_config import (
    mock_llm_config,
    mock_llm_config_proxy,
 )
+from tests.metagpt.provider.req_resp_const import (
+    get_openai_chat_completion,
+    get_openai_chat_completion_chunk,
+    llm_general_chat_funcs_test,
+    messages,
+    prompt,
+    resp_cont_tmpl,
+)
+
+name = "AI assistant"
+resp_cont = resp_cont_tmpl.format(name=name)
+default_resp = get_openai_chat_completion(name)
+
+default_resp_chunk = get_openai_chat_completion_chunk(name, usage_as_dict=True)
+
+usage = CompletionUsage(completion_tokens=110, prompt_tokens=92, total_tokens=202)


@pytest.mark.asyncio
@ -106,9 +121,11 @@ class TestOpenAI:

    def test_aask_code_json_decode_error(self, json_decode_error):
        instance = OpenAILLM(mock_llm_config)
-        with pytest.raises(json.decoder.JSONDecodeError) as e:
-            instance.get_choice_function_arguments(json_decode_error)
-        assert "JSONDecodeError" in str(e)
+        code = instance.get_choice_function_arguments(json_decode_error)
+        assert "code" in code
+        assert "language" in code
+        assert "hello world" in code["code"]
+        logger.info(f'code is : {code["code"]}')


@pytest.mark.asyncio
@ -121,3 +138,29 @@ async def test_gen_image():

    images: list[Image] = await llm.gen_image(model=model, prompt=prompt, resp_format="b64_json")
    assert images[0].size == (1024, 1024)
+
+
+async def mock_openai_acompletions_create(self, stream: bool = False, **kwargs) -> ChatCompletionChunk:
+    """Stand-in for ``AsyncCompletions.create`` that serves the module-level canned responses.
+
+    With ``stream`` truthy it returns an object whose async iteration yields
+    ``default_resp_chunk`` once; otherwise it returns ``default_resp``.
+    NOTE(review): the return annotation names only the chunk type, although the
+    non-stream branch returns ``default_resp`` — consider widening it.
+    """
+    if stream:
+
+        class Iterator(object):
+            # __aiter__ is written as an async generator, so iterating yields the single chunk.
+            async def __aiter__(self):
+                yield default_resp_chunk
+
+        return Iterator()
+    else:
+        return default_resp
+
+
+@pytest.mark.asyncio
+async def test_openai_acompletion(mocker):
+    """Check OpenAILLM.acompletion and the shared chat helpers against the canned responses."""
+    # Route the OpenAI async completions call to the mock defined above.
+    mocker.patch("openai.resources.chat.completions.AsyncCompletions.create", mock_openai_acompletions_create)
+
+    llm = OpenAILLM(mock_llm_config)
+
+    resp = await llm.acompletion(messages)
+    assert resp.choices[0].finish_reason == "stop"
+    assert resp.choices[0].message.content == resp_cont
+    # The response usage must equal the module-level ``usage`` constant.
+    assert resp.usage == usage
+
+    # Shared chat-function checks from req_resp_const.
+    await llm_general_chat_funcs_test(llm, prompt, messages, resp_cont)
--- a/tests/metagpt/provider/test_qianfan_api.py
+++ b/tests/metagpt/provider/test_qianfan_api.py
@ -0,0 +1,56 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Desc   : the unittest of qianfan api
+
+from typing import AsyncIterator, Union
+
+import pytest
+from qianfan.resources.typing import JsonBody, QfResponse
+
+from metagpt.provider.qianfan_api import QianFanLLM
+from tests.metagpt.provider.mock_llm_config import mock_llm_config_qianfan
+from tests.metagpt.provider.req_resp_const import (
+    get_qianfan_response,
+    llm_general_chat_funcs_test,
+    messages,
+    prompt,
+    resp_cont_tmpl,
+)
+
+name = "ERNIE-Bot-turbo"
+resp_cont = resp_cont_tmpl.format(name=name)
+
+
+def mock_qianfan_do(self, messages: list[dict], model: str, stream: bool = False, system: str = None) -> QfResponse:
+    """Synchronous stand-in patched over ``ChatCompletion.do``; ignores its arguments and returns the canned response for ``name``."""
+    return get_qianfan_response(name=name)
+
+
+async def mock_qianfan_ado(
+    self, messages: list[dict], model: str, stream: bool = True, system: str = None
+) -> Union[QfResponse, AsyncIterator[QfResponse]]:
+    """Asynchronous stand-in patched over ``ChatCompletion.ado`` in the test below.
+
+    Returns the canned response for the module-level ``name`` directly, or,
+    when ``stream`` is truthy, an async generator that yields that one response.
+    """
+    canned = get_qianfan_response(name=name)
+    if not stream:
+        return canned
+
+    async def _single_item_stream():
+        # The mocked stream consists of exactly one response.
+        yield canned
+
+    return _single_item_stream()
+
+
+@pytest.mark.asyncio
+async def test_qianfan_acompletion(mocker):
+    """Check QianFanLLM completion paths against the canned qianfan responses."""
+    # Replace both the sync and async qianfan entry points with the mocks above.
+    mocker.patch("qianfan.resources.llm.chat_completion.ChatCompletion.do", mock_qianfan_do)
+    mocker.patch("qianfan.resources.llm.chat_completion.ChatCompletion.ado", mock_qianfan_ado)
+
+    qianfan_llm = QianFanLLM(mock_llm_config_qianfan)
+
+    # Sync and async completion must both expose the canned text under "result".
+    resp = qianfan_llm.completion(messages)
+    assert resp.get("result") == resp_cont
+
+    resp = await qianfan_llm.acompletion(messages)
+    assert resp.get("result") == resp_cont
+
+    # Shared chat-function checks from req_resp_const.
+    await llm_general_chat_funcs_test(qianfan_llm, prompt, messages, resp_cont)
--- a/tests/metagpt/provider/test_spark_api.py
+++ b/tests/metagpt/provider/test_spark_api.py
@ -4,12 +4,18 @@

 import pytest

-from metagpt.config2 import Config
 from metagpt.provider.spark_api import GetMessageFromWeb, SparkLLM
-from tests.metagpt.provider.mock_llm_config import mock_llm_config
+from tests.metagpt.provider.mock_llm_config import (
+    mock_llm_config,
+    mock_llm_config_spark,
+)
+from tests.metagpt.provider.req_resp_const import (
+    llm_general_chat_funcs_test,
+    prompt,
+    resp_cont_tmpl,
+)

-prompt_msg = "who are you"
-resp_content = "I'm Spark"
+resp_cont = resp_cont_tmpl.format(name="Spark")


 class MockWebSocketApp(object):
@ -23,7 +29,7 @@ class MockWebSocketApp(object):
 def test_get_msg_from_web(mocker):
    mocker.patch("websocket.WebSocketApp", MockWebSocketApp)

-    get_msg_from_web = GetMessageFromWeb(prompt_msg, mock_llm_config)
+    get_msg_from_web = GetMessageFromWeb(prompt, mock_llm_config)
    assert get_msg_from_web.gen_params()["parameter"]["chat"]["domain"] == "mock_domain"

    ret = get_msg_from_web.run()
@ -31,34 +37,26 @@ def test_get_msg_from_web(mocker):


 def mock_spark_get_msg_from_web_run(self) -> str:
-    return resp_content
+    return resp_cont


@pytest.mark.asyncio
-async def test_spark_aask():
-    llm = SparkLLM(Config.from_home("spark.yaml").llm)
+async def test_spark_aask(mocker):
+    mocker.patch("metagpt.provider.spark_api.GetMessageFromWeb.run", mock_spark_get_msg_from_web_run)
+
+    llm = SparkLLM(mock_llm_config_spark)

    resp = await llm.aask("Hello!")
-    print(resp)
+    assert resp == resp_cont


@pytest.mark.asyncio
 async def test_spark_acompletion(mocker):
    mocker.patch("metagpt.provider.spark_api.GetMessageFromWeb.run", mock_spark_get_msg_from_web_run)

-    spark_gpt = SparkLLM(mock_llm_config)
+    spark_llm = SparkLLM(mock_llm_config)

-    resp = await spark_gpt.acompletion([])
-    assert resp == resp_content
+    resp = await spark_llm.acompletion([])
+    assert resp == resp_cont

-    resp = await spark_gpt.aask(prompt_msg, stream=False)
-    assert resp == resp_content
-
-    resp = await spark_gpt.acompletion_text([], stream=False)
-    assert resp == resp_content
-
-    resp = await spark_gpt.acompletion_text([], stream=True)
-    assert resp == resp_content
-
-    resp = await spark_gpt.aask(prompt_msg)
-    assert resp == resp_content
+    await llm_general_chat_funcs_test(spark_llm, prompt, prompt, resp_cont)
--- a/tests/metagpt/provider/test_zhipuai_api.py
+++ b/tests/metagpt/provider/test_zhipuai_api.py
@ -6,22 +6,24 @@ import pytest

 from metagpt.provider.zhipuai_api import ZhiPuAILLM
 from tests.metagpt.provider.mock_llm_config import mock_llm_config_zhipu
+from tests.metagpt.provider.req_resp_const import (
+    get_part_chat_completion,
+    llm_general_chat_funcs_test,
+    messages,
+    prompt,
+    resp_cont_tmpl,
+)

-prompt_msg = "who are you"
-messages = [{"role": "user", "content": prompt_msg}]
-
-resp_content = "I'm chatglm-turbo"
-default_resp = {
-    "choices": [{"finish_reason": "stop", "index": 0, "message": {"content": resp_content, "role": "assistant"}}],
-    "usage": {"completion_tokens": 22, "prompt_tokens": 19, "total_tokens": 41},
-}
+name = "ChatGLM-4"
+resp_cont = resp_cont_tmpl.format(name=name)
+default_resp = get_part_chat_completion(name)


-async def mock_zhipuai_acreate_stream(**kwargs):
+async def mock_zhipuai_acreate_stream(self, **kwargs):
    class MockResponse(object):
        async def _aread(self):
            class Iterator(object):
-                events = [{"choices": [{"index": 0, "delta": {"content": resp_content, "role": "assistant"}}]}]
+                events = [{"choices": [{"index": 0, "delta": {"content": resp_cont, "role": "assistant"}}]}]

                async def __aiter__(self):
                    for event in self.events:
@ -37,7 +39,7 @@ async def mock_zhipuai_acreate_stream(**kwargs):
    return MockResponse()


-async def mock_zhipuai_acreate(**kwargs) -> dict:
+async def mock_zhipuai_acreate(self, **kwargs) -> dict:
    return default_resp


@ -46,22 +48,12 @@ async def test_zhipuai_acompletion(mocker):
    mocker.patch("metagpt.provider.zhipuai.zhipu_model_api.ZhiPuModelAPI.acreate", mock_zhipuai_acreate)
    mocker.patch("metagpt.provider.zhipuai.zhipu_model_api.ZhiPuModelAPI.acreate_stream", mock_zhipuai_acreate_stream)

-    zhipu_gpt = ZhiPuAILLM(mock_llm_config_zhipu)
+    zhipu_llm = ZhiPuAILLM(mock_llm_config_zhipu)

-    resp = await zhipu_gpt.acompletion(messages)
-    assert resp["choices"][0]["message"]["content"] == resp_content
+    resp = await zhipu_llm.acompletion(messages)
+    assert resp["choices"][0]["message"]["content"] == resp_cont

-    resp = await zhipu_gpt.aask(prompt_msg, stream=False)
-    assert resp == resp_content
-
-    resp = await zhipu_gpt.acompletion_text(messages, stream=False)
-    assert resp == resp_content
-
-    resp = await zhipu_gpt.acompletion_text(messages, stream=True)
-    assert resp == resp_content
-
-    resp = await zhipu_gpt.aask(prompt_msg)
-    assert resp == resp_content
+    await llm_general_chat_funcs_test(zhipu_llm, prompt, messages, resp_cont)


 def test_zhipuai_proxy():
--- a/tests/metagpt/roles/ci/test_code_interpreter.py
+++ b/tests/metagpt/roles/ci/test_code_interpreter.py
@ -1,19 +0,0 @@
-import pytest
-
-from metagpt.logs import logger
-from metagpt.roles.ci.code_interpreter import CodeInterpreter
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("auto_run", [(True), (False)])
-async def test_code_interpreter(mocker, auto_run):
-    mocker.patch("metagpt.actions.ci.execute_nb_code.ExecuteNbCode.run", return_value=("a successful run", True))
-    mocker.patch("builtins.input", return_value="confirm")
-
-    requirement = "Run data analysis on sklearn Iris dataset, include a plot"
-    tools = []
-
-    ci = CodeInterpreter(auto_run=auto_run, use_tools=True, tools=tools)
-    rsp = await ci.run(requirement)
-    logger.info(rsp)
-    assert len(rsp.content) > 0
--- a/tests/metagpt/roles/mi/test_interpreter.py
+++ b/tests/metagpt/roles/mi/test_interpreter.py
@ -0,0 +1,23 @@
+import pytest
+
+from metagpt.logs import logger
+from metagpt.roles.mi.interpreter import Interpreter
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("auto_run", [(True), (False)])
+async def test_interpreter(mocker, auto_run):
+    """Run Interpreter with notebook execution and user input mocked, for both ``auto_run`` values."""
+    # Code execution is stubbed to report success; any input() prompt answers "confirm".
+    mocker.patch("metagpt.actions.mi.execute_nb_code.ExecuteNbCode.run", return_value=("a successful run", True))
+    mocker.patch("builtins.input", return_value="confirm")
+
+    requirement = "Run data analysis on sklearn Iris dataset, include a plot"
+    tools = []
+
+    mi = Interpreter(auto_run=auto_run, use_tools=True, tools=tools)
+    rsp = await mi.run(requirement)
+    logger.info(rsp)
+    assert len(rsp.content) > 0
+
+    # At least one task must be finished and carry non-empty code.
+    finished_tasks = mi.planner.plan.get_finished_tasks()
+    assert len(finished_tasks) > 0
+    assert len(finished_tasks[0].code) > 0  # check one task to see if code is recorded
--- a/tests/metagpt/roles/mi/test_ml_engineer.py
+++ b/tests/metagpt/roles/mi/test_ml_engineer.py
@ -1,16 +1,16 @@
 import pytest

-from metagpt.actions.ci.execute_nb_code import ExecuteNbCode
+from metagpt.actions.mi.execute_nb_code import ExecuteNbCode
 from metagpt.logs import logger
-from metagpt.roles.ci.ml_engineer import MLEngineer
+from metagpt.roles.mi.ml_engineer import MLEngineer
 from metagpt.schema import Message, Plan, Task
 from metagpt.tools.tool_type import ToolType
-from tests.metagpt.actions.ci.test_debug_code import CODE, DebugContext, ErrorStr
+from tests.metagpt.actions.mi.test_debug_code import CODE, DebugContext, ErrorStr


 def test_mle_init():
-    ci = MLEngineer(goal="test", auto_run=True, use_tools=True, tools=["tool1", "tool2"])
-    assert ci.tools == []
+    mle = MLEngineer(goal="test", auto_run=True, use_tools=True, tools=["tool1", "tool2"])
+    assert mle.tools == []


 MockPlan = Plan(
--- a/tests/metagpt/tools/libs/test_email_login.py
+++ b/tests/metagpt/tools/libs/test_email_login.py
@ -0,0 +1,7 @@
+from metagpt.tools.libs.email_login import email_login_imap
+
+
+def test_email_login(mocker):
+    """Check that ``email_login_imap`` goes through the patched ``MailBox.login``."""
+    # mocker.patch returns the mock that replaces ``MailBox.login`` itself, so its
+    # return value is configured directly rather than on a nested ``.login`` attribute.
+    mock_login = mocker.patch("metagpt.tools.libs.email_login.MailBox.login")
+    mock_login.return_value = mocker.Mock()
+    email_login_imap("test@outlook.com", "test_password")
+    # Without an assertion the test would pass even if login were never attempted.
+    mock_login.assert_called_once()
+    email_login_imap("test@outlook.com", "test_password")
--- a/tests/metagpt/utils/test_mermaid.py
+++ b/tests/metagpt/utils/test_mermaid.py
@ -14,7 +14,7 @@ from metagpt.utils.mermaid import MMC1, mermaid_to_file

@pytest.mark.asyncio
@pytest.mark.parametrize("engine", ["nodejs", "ink"])  # TODO: playwright and pyppeteer
-async def test_mermaid(engine, context):
+async def test_mermaid(engine, context, mermaid_mocker):
    # nodejs prerequisites: npm install -g @mermaid-js/mermaid-cli
    # ink prerequisites: connected to internet
    # playwright prerequisites: playwright install --with-deps chromium
--- a/tests/metagpt/utils/test_repair_llm_raw_output.py
+++ b/tests/metagpt/utils/test_repair_llm_raw_output.py
@ -211,6 +211,11 @@ value
    output = repair_invalid_json(output, "Expecting ',' delimiter: line 4 column 1")
    assert output == target_output

+    raw_output = '{"key": "url "http" \\"https\\" "}'
+    target_output = '{"key": "url \\"http\\" \\"https\\" "}'
+    output = repair_invalid_json(raw_output, "Expecting ',' delimiter: line 1 column 15 (char 14)")
+    assert output == target_output
+

 def test_retry_parse_json_text():
    from metagpt.utils.repair_llm_raw_output import retry_parse_json_text
--- a/tests/metagpt/utils/test_save_code.py
+++ b/tests/metagpt/utils/test_save_code.py
@ -6,7 +6,7 @@
 import nbformat
 import pytest

-from metagpt.actions.ci.execute_nb_code import ExecuteNbCode
+from metagpt.actions.mi.execute_nb_code import ExecuteNbCode
 from metagpt.utils.common import read_json_file
 from metagpt.utils.save_code import DATA_PATH, save_code_file

--- a/tests/metagpt/utils/test_text.py
+++ b/tests/metagpt/utils/test_text.py
@ -42,6 +42,7 @@ def test_reduce_message_length(msgs, model_name, system_text, reserved, expected
        (" ".join("Hello World." for _ in range(1000)), "Prompt: {}", "gpt-3.5-turbo-16k", "System", 3000, 1),
        (" ".join("Hello World." for _ in range(4000)), "Prompt: {}", "gpt-4", "System", 2000, 2),
        (" ".join("Hello World." for _ in range(8000)), "Prompt: {}", "gpt-4-32k", "System", 4000, 1),
+        (" ".join("Hello World" for _ in range(8000)), "Prompt: {}", "gpt-3.5-turbo", "System", 1000, 8),
    ],
 )
 def test_generate_prompt_chunk(text, prompt_template, model_name, system_text, reserved, expected):
--- a/tests/mock/mock_aiohttp.py
+++ b/tests/mock/mock_aiohttp.py
@ -10,6 +10,7 @@ class MockAioResponse:
    check_funcs: dict[tuple[str, str], Callable[[dict], str]] = {}
    rsp_cache: dict[str, str] = {}
    name = "aiohttp"
+    status = 200

    def __init__(self, session, method, url, **kwargs) -> None:
        fn = self.check_funcs.get((method, url))
@ -22,6 +23,7 @@ class MockAioResponse:
    async def __aenter__(self):
        if self.response:
            await self.response.__aenter__()
+            self.status = self.response.status
        elif self.mng:
            self.response = await self.mng.__aenter__()
        return self
@ -41,6 +43,17 @@ class MockAioResponse:
        self.rsp_cache[self.key] = data
        return data

+    @property
+    def content(self):
+        """Return this mock itself, so ``content.read()`` on it resolves to this class's ``read``."""
+        return self
+
+    async def read(self):
+        """Return the response body, served from ``rsp_cache`` when this key is cached.
+
+        On a cache miss the body is read from the wrapped response and stored as
+        ``str(data)``; on a hit that stored text is parsed back into a Python literal.
+        """
+        import ast  # local import: only this method needs it
+
+        if self.key in self.rsp_cache:
+            # literal_eval parses the stored repr without executing arbitrary code, unlike eval.
+            return ast.literal_eval(self.rsp_cache[self.key])
+        data = await self.response.content.read()
+        self.rsp_cache[self.key] = str(data)
+        return data
+
    def raise_for_status(self):
        if self.response:
            self.response.raise_for_status()
--- a/tests/spark.yaml
+++ b/tests/spark.yaml
@ -1,7 +0,0 @@
-llm:
-  api_type: "spark"
-  app_id: "xxx"
-  api_key: "xxx"
-  api_secret: "xxx"
-  domain: "generalv2"
-  base_url: "wss://spark-api.xf-yun.com/v3.1/chat"