tool management in one place, add aask_code mock, azure mock

yzlin 2024-01-11 22:55:31 +08:00
parent 9e0b9745be
commit e99c5f29f4
9 changed files with 167 additions and 74 deletions

View file

@@ -20,14 +20,16 @@ from metagpt.prompts.ml_engineer import (
GENERATE_CODE_PROMPT,
ML_TOOL_USAGE_PROMPT,
SELECT_FUNCTION_TOOLS,
TASK_MODULE_MAP,
TASK_SPECIFIC_PROMPT,
TOOL_RECOMMENDATION_PROMPT,
TOOL_USAGE_PROMPT,
)
from metagpt.schema import Message, Plan
from metagpt.tools import TOOL_TYPE_MAPPINGS
from metagpt.utils.common import create_func_config, remove_comments
TOOL_TYPE_MODULE = {k: v.module for k, v in TOOL_TYPE_MAPPINGS.items()}
TOOL_TYPE_USAGE_PROMPT = {k: v.usage_prompt for k, v in TOOL_TYPE_MAPPINGS.items()}
class BaseWriteAnalysisCode(Action):
DEFAULT_SYSTEM_MSG: str = """You are Code Interpreter, a world-class programmer that can complete any goal by executing code. Strictly follow the plan and generate code step by step. Each step of the code will be executed on the user's machine, and the user will provide the code execution results to you. **Notice: The code for the next step depends on the code for the previous step. You must reuse variables from the latest previous code directly; don't create them again, this is very important. Use !pip install in a standalone block to install missing packages. Usually the libraries you need are already installed. Don't check whether packages are already imported.**""" # prompt reference: https://github.com/KillianLucas/open-interpreter/blob/v0.1.4/interpreter/system_message.txt
@@ -171,9 +173,11 @@ class WriteCodeWithTools(BaseWriteAnalysisCode):
plan: Plan = None,
**kwargs,
) -> str:
task_type = plan.current_task.task_type
available_tools = self.available_tools.get(task_type, {})
special_prompt = TASK_SPECIFIC_PROMPT.get(task_type, "")
tool_type = (
plan.current_task.task_type
) # find tool type from task type through exact match; can be extended to retrieval in the future
available_tools = self.available_tools.get(tool_type, {})
special_prompt = TOOL_TYPE_USAGE_PROMPT.get(tool_type, "")
code_steps = plan.current_task.code_steps
finished_tasks = plan.get_finished_tasks()
@@ -189,10 +193,10 @@ class WriteCodeWithTools(BaseWriteAnalysisCode):
recommend_tools = await self._tool_recommendation(
plan.current_task.instruction, code_steps, available_tools
)
tool_catalog = self._parse_recommend_tools(task_type, recommend_tools)
tool_catalog = self._parse_recommend_tools(tool_type, recommend_tools)
logger.info(f"Recommended tools: \n{recommend_tools}")
module_name = TASK_MODULE_MAP[task_type]
module_name = TOOL_TYPE_MODULE[tool_type]
tools_instruction = TOOL_USAGE_PROMPT.format(
special_prompt=special_prompt, module_name=module_name, tool_catalog=tool_catalog
@@ -215,9 +219,9 @@ class WriteCodeWithToolsML(WriteCodeWithTools):
column_info: str = "",
**kwargs,
) -> Tuple[List[Message], str]:
task_type = plan.current_task.task_type
available_tools = self.available_tools.get(task_type, {})
special_prompt = TASK_SPECIFIC_PROMPT.get(task_type, "")
tool_type = plan.current_task.task_type
available_tools = self.available_tools.get(tool_type, {})
special_prompt = TOOL_TYPE_USAGE_PROMPT.get(tool_type, "")
code_steps = plan.current_task.code_steps
finished_tasks = plan.get_finished_tasks()
@@ -230,10 +234,10 @@ class WriteCodeWithToolsML(WriteCodeWithTools):
recommend_tools = await self._tool_recommendation(
plan.current_task.instruction, code_steps, available_tools
)
tool_catalog = self._parse_recommend_tools(task_type, recommend_tools)
tool_catalog = self._parse_recommend_tools(tool_type, recommend_tools)
logger.info(f"Recommended tools: \n{recommend_tools}")
module_name = TASK_MODULE_MAP[task_type]
module_name = TOOL_TYPE_MODULE[tool_type]
prompt = ML_TOOL_USAGE_PROMPT.format(
user_requirement=plan.goal,
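Note: the following is a minimal, self-contained sketch (not part of the commit) of the lookup flow introduced above, where the current task's task_type is used directly as the tool_type key. ToolTypeStub, MAPPINGS, and the module string are simplified, illustrative stand-ins for metagpt.tools.ToolType and TOOL_TYPE_MAPPINGS.

from dataclasses import dataclass

@dataclass
class ToolTypeStub:  # simplified stand-in for metagpt.tools.ToolType
    name: str
    module: str = ""
    usage_prompt: str = ""

MAPPINGS = {  # stand-in for TOOL_TYPE_MAPPINGS
    "data_preprocess": ToolTypeStub("data_preprocess", "example_module.data_preprocess", "preprocessing hints ..."),
    "model_train": ToolTypeStub("model_train", "", "training hints ..."),
}

# same derivations as at the top of write_analysis_code.py
TOOL_TYPE_MODULE = {k: v.module for k, v in MAPPINGS.items()}
TOOL_TYPE_USAGE_PROMPT = {k: v.usage_prompt for k, v in MAPPINGS.items()}

tool_type = "model_train"  # equals plan.current_task.task_type for now (exact match)
special_prompt = TOOL_TYPE_USAGE_PROMPT.get(tool_type, "")
module_name = TOOL_TYPE_MODULE[tool_type]
print(module_name, special_prompt)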

View file

@@ -12,6 +12,7 @@ from metagpt.actions import Action
from metagpt.logs import logger
from metagpt.prompts.ml_engineer import ASSIGN_TASK_TYPE_CONFIG, ASSIGN_TASK_TYPE_PROMPT
from metagpt.schema import Message, Plan, Task
from metagpt.tools import TOOL_TYPE_MAPPINGS
from metagpt.utils.common import CodeParser, create_func_config
@@ -46,7 +47,10 @@ class WritePlan(Action):
List[Dict]: tasks with task type assigned
"""
task_list = "\n".join([f"Task {task['task_id']}: {task['instruction']}" for task in tasks])
prompt = ASSIGN_TASK_TYPE_PROMPT.format(task_list=task_list)
task_type_desc = "\n".join([f"- **{item.name}**: {item.desc}" for item in TOOL_TYPE_MAPPINGS.values()])
prompt = ASSIGN_TASK_TYPE_PROMPT.format(
task_list=task_list, task_type_desc=task_type_desc
) # task types are set to be the same as tool types, for now
tool_config = create_func_config(ASSIGN_TASK_TYPE_CONFIG)
rsp = await self.llm.aask_code(prompt, **tool_config)
task_type_list = rsp["task_type"]
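For reference, a tiny standalone snippet (illustrative only) showing the string that the task_type_desc join produces when fed the descriptions defined in TOOL_TYPE_MAPPINGS later in this commit:

descs = {
    "data_preprocess": "Only for changing value inplace.",
    "feature_engineering": "Only for creating new columns for input data.",
    "model_train": "Only for training model.",
    "model_evaluate": "Only for evaluating model.",
    "other": "Any tasks that do not fit into the previous categories",
}
# same "- **name**: desc" layout as the join in WritePlan above
task_type_desc = "\n".join(f"- **{name}**: {desc}" for name, desc in descs.items())
print(task_type_desc)
# - **data_preprocess**: Only for changing value inplace.
# - **feature_engineering**: Only for creating new columns for input data.
# - **model_train**: Only for training model.
# - **model_evaluate**: Only for evaluating model.
# - **other**: Any tasks that do not fit into the previous categories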

View file

@@ -54,11 +54,7 @@ Please assign a task type to each task in the list below from the given categories:
{task_list}
## All Task Type:
- **feature_engineering**: Only for creating new columns for input data.
- **data_preprocess**: Only for changing value inplace.
- **model_train**: Only for training model.
- **model_evaluate**: Only for evaluating model.
- **other**: Any tasks that do not fit into the previous categories, such as visualization, summarizing findings, etc.
{task_type_desc}
"""
ASSIGN_TASK_TYPE_CONFIG = {
@@ -278,52 +274,3 @@ for col in num_cols:
- The output code should contain all steps implemented correctly in 'Code Steps'.
"""
# - If 'Code Steps' contains step done in 'Done Tasks', such as reading data, don't repeat it.
DATA_PREPROCESS_PROMPT = """
The current task is about data preprocessing, please note the following:
- Monitor data types per column, applying appropriate methods.
- Ensure operations are on existing dataset columns.
- Avoid writing processed data to files.
- Avoid any change to label column, such as standardization, etc.
- Prefer alternatives to one-hot encoding for categorical data.
- Only encode or scale necessary columns to allow for potential feature-specific engineering tasks (like time_extract, binning, extraction, etc.) later.
- Each step that does data preprocessing on the train data must also do the same for the test data separately in the same step.
"""
FEATURE_ENGINEERING_PROMPT = """
The current task is about feature engineering. When performing it, please adhere to the following principles:
- Generate features that are as diverse as possible to improve the model's performance step by step.
- If potential impactful features are not included in 'Code Steps', add new steps to generate them.
- Avoid creating redundant or excessively numerous features in one step.
- Exclude ID columns from feature generation and remove them.
- Each step that does feature engineering on the train data must also do the same for the test data separately in the same step.
- Avoid using the label column to create features, except for cat encoding.
- Use the data from the previous task result if it exists; do not mock or reload data yourself.
"""
MODEL_TRAIN_PROMPT = """
The current task is about training a model, please ensure high performance:
- Keep in mind that your user prioritizes results and is highly focused on model performance. So, when needed, feel free to use models of any complexity to improve effectiveness, such as LightGBM, XGBoost, CatBoost, etc.
- If non-numeric columns exist, perform label encoding together with all steps.
- Use the data from the previous task result directly; do not mock or reload data yourself.
- Set suitable hyperparameters for the model to make the metrics as high as possible.
"""
MODEL_EVALUATE_PROMPT = """
The current task is about evaluating a model, please note the following:
- Ensure that the evaluated data is processed the same way as the training data. If not, remember to use the objects from 'Done Tasks' to transform the data.
- Use the trained model from the previous task result directly; do not mock or reload the model yourself.
"""
TASK_SPECIFIC_PROMPT = {
"data_preprocess": DATA_PREPROCESS_PROMPT,
"feature_engineering": FEATURE_ENGINEERING_PROMPT,
"model_train": MODEL_TRAIN_PROMPT,
"model_evaluate": MODEL_EVALUATE_PROMPT,
}
TASK_MODULE_MAP = {
"data_preprocess": "metagpt.tools.functions.libs.data_preprocess",
"feature_engineering": "metagpt.tools.functions.libs.feature_engineering",
"udf": "metagpt.tools.functions.libs.udf",
}

View file

@@ -0,0 +1,35 @@
DATA_PREPROCESS_PROMPT = """
The current task is about data preprocessing, please note the following:
- Monitor data types per column, applying appropriate methods.
- Ensure operations are on existing dataset columns.
- Avoid writing processed data to files.
- Avoid any change to label column, such as standardization, etc.
- Prefer alternatives to one-hot encoding for categorical data.
- Only encode or scale necessary columns to allow for potential feature-specific engineering tasks (like time_extract, binning, extraction, etc.) later.
- Each step that does data preprocessing on the train data must also do the same for the test data separately in the same step.
"""
FEATURE_ENGINEERING_PROMPT = """
The current task is about feature engineering. When performing it, please adhere to the following principles:
- Generate features that are as diverse as possible to improve the model's performance step by step.
- If potential impactful features are not included in 'Code Steps', add new steps to generate them.
- Avoid creating redundant or excessively numerous features in one step.
- Exclude ID columns from feature generation and remove them.
- Each step that does feature engineering on the train data must also do the same for the test data separately in the same step.
- Avoid using the label column to create features, except for cat encoding.
- Use the data from the previous task result if it exists; do not mock or reload data yourself.
"""
MODEL_TRAIN_PROMPT = """
The current task is about training a model, please ensure high performance:
- Keep in mind that your user prioritizes results and is highly focused on model performance. So, when needed, feel free to use models of any complexity to improve effectiveness, such as LightGBM, XGBoost, CatBoost, etc.
- If non-numeric columns exist, perform label encoding together with all steps.
- Use the data from the previous task result directly; do not mock or reload data yourself.
- Set suitable hyperparameters for the model to make the metrics as high as possible.
"""
MODEL_EVALUATE_PROMPT = """
The current task is about evaluating a model, please note the following:
- Ensure that the evaluated data is processed the same way as the training data. If not, remember to use the objects from 'Done Tasks' to transform the data.
- Use the trained model from the previous task result directly; do not mock or reload the model yourself.
"""

View file

@@ -9,6 +9,16 @@
from enum import Enum
from pydantic import BaseModel
from metagpt.const import TOOL_SCHEMA_PATH
from metagpt.prompts.tool_type import (
DATA_PREPROCESS_PROMPT,
FEATURE_ENGINEERING_PROMPT,
MODEL_TRAIN_PROMPT,
MODEL_EVALUATE_PROMPT,
)
class SearchEngineType(Enum):
SERPAPI_GOOGLE = "serpapi"
@@ -27,3 +37,44 @@ class WebBrowserEngineType(Enum):
def __missing__(cls, key):
"""Default type conversion"""
return cls.CUSTOM
class ToolType(BaseModel):
name: str
module: str = ""
desc: str
usage_prompt: str = ""
TOOL_TYPE_MAPPINGS = {
"data_preprocess": ToolType(
name="data_preprocess",
module=str(TOOL_SCHEMA_PATH / "data_preprocess"),
desc="Only for changing value inplace.",
usage_prompt=DATA_PREPROCESS_PROMPT,
),
"feature_engineering": ToolType(
name="feature_engineering",
module=str(TOOL_SCHEMA_PATH / "feature_engineering"),
desc="Only for creating new columns for input data.",
usage_prompt=FEATURE_ENGINEERING_PROMPT,
),
"model_train": ToolType(
name="model_train",
module="",
desc="Only for training model.",
usage_prompt=MODEL_TRAIN_PROMPT,
),
"model_evaluate": ToolType(
name="model_evaluate",
module="",
desc="Only for evaluating model.",
usage_prompt=MODEL_EVALUATE_PROMPT,
),
"other": ToolType(
name="other",
module="",
desc="Any tasks that do not fit into the previous categories",
usage_prompt="",
),
}
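With tool metadata consolidated in this single registry, supporting a new category only takes one more entry. The snippet below is a hypothetical sketch: the "data_visualize" type and its description are invented for illustration and are not part of this commit (it assumes metagpt.tools exposes ToolType and TOOL_TYPE_MAPPINGS as defined above).

# hypothetical example: a new tool type would be picked up automatically by the
# TOOL_TYPE_MODULE / TOOL_TYPE_USAGE_PROMPT dicts in write_analysis_code.py and
# by the task_type_desc listing built in WritePlan
from metagpt.tools import TOOL_TYPE_MAPPINGS, ToolType

TOOL_TYPE_MAPPINGS["data_visualize"] = ToolType(
    name="data_visualize",
    module="",  # no tool schema registered for this type yet
    desc="Only for plotting and visualizing data.",  # invented description
    usage_prompt="",
)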

View file

@@ -34,14 +34,14 @@ def rsp_cache():
rsp_cache_file_path = TEST_DATA_PATH / "rsp_cache.json" # read repo-provided
new_rsp_cache_file_path = TEST_DATA_PATH / "rsp_cache_new.json" # exporting a new copy
if os.path.exists(rsp_cache_file_path):
with open(rsp_cache_file_path, "r") as f1:
with open(rsp_cache_file_path, "r", encoding="utf-8") as f1:
rsp_cache_json = json.load(f1)
else:
rsp_cache_json = {}
yield rsp_cache_json
with open(rsp_cache_file_path, "w") as f2:
with open(rsp_cache_file_path, "w", encoding="utf-8") as f2:
json.dump(rsp_cache_json, f2, indent=4, ensure_ascii=False)
with open(new_rsp_cache_file_path, "w") as f2:
with open(new_rsp_cache_file_path, "w", encoding="utf-8") as f2:
json.dump(RSP_CACHE_NEW, f2, indent=4, ensure_ascii=False)
@@ -60,6 +60,7 @@ def llm_mock(rsp_cache, mocker, request):
llm.rsp_cache = rsp_cache
mocker.patch("metagpt.provider.base_llm.BaseLLM.aask", llm.aask)
mocker.patch("metagpt.provider.base_llm.BaseLLM.aask_batch", llm.aask_batch)
mocker.patch("metagpt.provider.openai_api.OpenAILLM.aask_code", llm.aask_code)
yield mocker
if hasattr(request.node, "test_outcome") and request.node.test_outcome.passed:
if llm.rsp_candidates:
@@ -67,7 +68,7 @@ def llm_mock(rsp_cache, mocker, request):
cand_key = list(rsp_candidate.keys())[0]
cand_value = list(rsp_candidate.values())[0]
if cand_key not in llm.rsp_cache:
logger.info(f"Added '{cand_key[:100]} ... -> {cand_value[:20]} ...' to response cache")
logger.info(f"Added '{cand_key[:100]} ... -> {str(cand_value)[:20]} ...' to response cache")
llm.rsp_cache.update(rsp_candidate)
RSP_CACHE_NEW.update(rsp_candidate)

View file

@@ -1,4 +1,12 @@
from metagpt.actions.write_plan import Plan, Task, precheck_update_plan_from_rsp
import pytest
from metagpt.actions.write_plan import (
Plan,
Task,
WritePlan,
precheck_update_plan_from_rsp,
)
from metagpt.schema import Message
def test_precheck_update_plan_from_rsp():
@@ -12,3 +20,12 @@ def test_precheck_update_plan_from_rsp():
invalid_rsp = "wrong"
success, _ = precheck_update_plan_from_rsp(invalid_rsp, plan)
assert not success
@pytest.mark.asyncio
async def test_write_plan():
rsp = await WritePlan().run(context=[Message("run analysis on sklearn iris dataset", role="user")])
assert "task_id" in rsp
assert "instruction" in rsp
assert "json" not in rsp # the output should be the content inside ```json ```

View file

@@ -0,0 +1,13 @@
import pytest
from metagpt.logs import logger
from metagpt.roles.code_interpreter import CodeInterpreter
@pytest.mark.asyncio
async def test_code_interpreter():
requirement = "Run data analysis on sklearn Iris dataset, include a plot"
ci = CodeInterpreter(goal=requirement, auto_run=True, use_tools=False)
rsp = await ci.run(requirement)
logger.info(rsp)
assert len(rsp.content) > 0

View file

@@ -1,10 +1,16 @@
from typing import Optional
import json
from typing import Optional, Union
from metagpt.config import CONFIG
from metagpt.logs import log_llm_stream, logger
from metagpt.provider.azure_openai_api import AzureOpenAILLM
from metagpt.provider.openai_api import OpenAILLM
from metagpt.schema import Message
OriginalLLM = OpenAILLM if not CONFIG.openai_api_type else AzureOpenAILLM
class MockLLM(OpenAILLM):
class MockLLM(OriginalLLM):
def __init__(self, allow_open_api_call):
super().__init__()
self.allow_open_api_call = allow_open_api_call
@@ -58,6 +64,15 @@ class MockLLM(OpenAILLM):
context.append(self._assistant_msg(rsp_text))
return self._extract_assistant_rsp(context)
async def original_aask_code(self, messages: Union[str, Message, list[dict]], **kwargs) -> dict:
"""
A copy of metagpt.provider.openai_api.OpenAILLM.aask_code; we can't use super().aask_code because it will be mocked.
Since openai_api.OpenAILLM.aask_code is different from base_llm.BaseLLM.aask_code, we use the former.
"""
messages = self._process_message(messages)
rsp = await self._achat_completion_function(messages, **kwargs)
return self.get_choice_function_arguments(rsp)
async def aask(
self,
msg: str,
@@ -78,6 +93,12 @@ class MockLLM(OpenAILLM):
rsp = await self._mock_rsp(msg_key, self.original_aask_batch, msgs, timeout)
return rsp
async def aask_code(self, messages: Union[str, Message, list[dict]], **kwargs) -> dict:
messages = self._process_message(messages)
msg_key = json.dumps(messages, ensure_ascii=False)
rsp = await self._mock_rsp(msg_key, self.original_aask_code, messages, **kwargs)
return rsp
async def _mock_rsp(self, msg_key, ask_func, *args, **kwargs):
if msg_key not in self.rsp_cache:
if not self.allow_open_api_call:
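The hunk is cut off here. As a rough, standalone sketch of the cache-or-call behaviour that the new aask_code path reuses (the function name, error message, and bookkeeping below are illustrative assumptions, not the repository's exact code):

async def mock_rsp_sketch(rsp_cache: dict, msg_key: str, ask_func, *args, allow_open_api_call: bool = False, **kwargs):
    # serve a cached response when one exists; otherwise call the real API
    # (only if explicitly allowed) and record the result for later runs
    if msg_key not in rsp_cache:
        if not allow_open_api_call:
            raise ValueError("open API calls are disabled for this test run")  # illustrative message
        rsp_cache[msg_key] = await ask_func(*args, **kwargs)
    return rsp_cache[msg_key]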