#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2023/11/24 15:43
# @Author  : lidanyang
# @File    : ml_engineer
# @Desc    : Prompt templates and function-call schemas for ML tool selection and code generation.
"""Prompt templates, function-call schemas and lookup tables for the ML engineer.

Everything in this module is a plain constant:

* ``*_PROMPT`` values are ``str.format`` templates (or fixed text snippets).
* ``SELECT_FUNCTION_TOOLS`` / ``CODE_GENERATOR_WITH_TOOLS`` are dicts laid out as
  ``name`` / ``description`` / ``parameters`` function descriptors.
* ``ML_SPECIFIC_PROMPT``, ``TOOL_OUTPUT_DESC`` and ``ML_MODULE_MAP`` are keyed by
  the same four task-type strings: ``data_preprocess``, ``feature_engineering``,
  ``classification_model`` and ``regression_model``.
"""

# Template asking for a per-step tool recommendation.
# Placeholders: {task}, {code_steps}, {available_tools}.
TOOL_RECOMMENDATION_PROMPT = """
## Comprehensive Task Description:
{task}

This task is divided into several steps, and you need to select the most suitable tools for each step. A tool means a function that can be used to help you solve the task.

## Detailed Code Steps for the Task:
{code_steps}

## List of Available Tools:
{available_tools}

## Tool Selection and Instructions:
- For each code step listed above, choose up to five tools that are most likely to be useful in solving the task.
- If you believe that no tools are suitable for a step, indicate with an empty list.
- Only list the names of the tools, not the full schema of each tool.
- The result should only contain tool names that are in the list of available tools.
- The result list should be in the same order as the code steps.
"""

# Function descriptor whose single required argument, ``recommend_tools``, is a
# list of lists of tool-name strings (one inner list per code step).
SELECT_FUNCTION_TOOLS = {
    "name": "select_function_tools",
    "description": "Given code steps to generate full code for a task, select suitable tools for each step by order.",
    "parameters": {
        "type": "object",
        "properties": {
            "recommend_tools": {
                "type": "array",
                "description": "List of tool names for each code step. Empty list if no tool is suitable.",
                "items": {
                    "type": "array",
                    "items": {
                        "type": "string",
                    },
                },
            },
        },
        "required": ["recommend_tools"],
    },
}


# Function descriptor whose single required argument, ``code``, is the source
# text to append to the notebook.
CODE_GENERATOR_WITH_TOOLS = {
    "name": "add_subtask_code",
    "description": "Add new code of current subtask to the end of an active Jupyter notebook.",
    "parameters": {
        "type": "object",
        "properties": {
            "code": {
                "type": "string",
                "description": "The code to be added.",
            },
        },
        "required": ["code"],
    },
}

# Template for generating the code of the current sub-task with the selected tools.
# Placeholders: {all_tasks}, {completed_code}, {data_desc}, {special_prompt},
# {code_steps}, {module_name}, {output_desc}, {available_tools}.
# NOTE(review): {special_prompt}, {output_desc} and {module_name} look like they
# are meant to be filled from ML_SPECIFIC_PROMPT, TOOL_OUTPUT_DESC and
# ML_MODULE_MAP respectively - confirm against the caller.
# NOTE: the name is a historical misspelling of "TOOL"; it is kept so existing
# imports keep working. Prefer the TOOL_ORGANIZATION_PROMPT alias defined below.
TOO_ORGANIZATION_PROMPT = """
As a senior data scientist, your role involves developing code for a specific sub-task within a larger project. This project is divided into several sub-tasks, which may either be new challenges or extensions of previous work.

## Sub-tasks Overview
Here's a list of all the sub-tasks, indicating their current status (DONE or TODO). Your responsibility is the first TODO task on this list.
{all_tasks}

## Historical Code (Previously Done Sub-tasks):
This code, already executed in the Jupyter notebook, is critical for understanding the background and foundation for your current task.
```python
{completed_code}
```

## Dataset Description:
Details about the dataset for the project:
{data_desc}

## Current Task Notion:
{special_prompt}

## Code Steps for Your Sub-task:
Follow these steps to complete your current TODO task. You may use external Python functions or write custom code as needed. Ensure your code is self-contained.
{code_steps}

When you call a function, you should import the function from `{module_name}` first, e.g.:
```python
from metagpt.tools.functions.libs.feature_engineering import fill_missing_value
```

## Available Functions for Each Step:
Each function is described in JSON format, including the function name and parameters. {output_desc}
{available_tools}

## Your Output Format:
Generate the complete code for every step, listing any used function tools at the beginning of the step:
```python
# Step 1
# Tools used: [function names or 'none']


# Step 2
# Tools used: [function names or 'none']


# Continue with additional steps, following the same format...
```end

*** Important Rules ***
- Use only the tools designated for each code step.
- Your output should only include code for the current sub-task. Don't repeat historical code.
- Only mention functions in comments if used in the code.
- Ensure the output new code is executable in the current Jupyter notebook environment, with all historical code executed.
"""

# Correctly spelled alias for the template above; both names refer to the same string.
TOOL_ORGANIZATION_PROMPT = TOO_ORGANIZATION_PROMPT


# Task-type specific guidance snippets. Only data preprocessing has content so
# far; the other three are placeholders containing a single newline.
DATA_PREPROCESS_PROMPT = """
In data preprocessing, closely monitor each column's data type. Apply suitable methods for various types (numerical, categorical, datetime, textual, etc.) to ensure the pandas.DataFrame is correctly formatted.
Additionally, ensure that the columns being processed must be the ones that actually exist in the dataset.
"""

FEATURE_ENGINEERING_PROMPT = """
"""

CLASSIFICATION_MODEL_PROMPT = """
"""

REGRESSION_MODEL_PROMPT = """
"""


# Task-type specific notes about what the tool functions output. Only data
# preprocessing has a note; the others are empty strings.
DATA_PREPROCESS_OUTPUT_DESC = "Please note that all functions uniformly output a processed pandas.DataFrame, facilitating seamless integration into the broader workflow."

FEATURE_ENGINEERING_OUTPUT_DESC = ""

CLASSIFICATION_MODEL_OUTPUT_DESC = ""

REGRESSION_MODEL_OUTPUT_DESC = ""


# Task type -> guidance snippet.
ML_SPECIFIC_PROMPT = {
    "data_preprocess": DATA_PREPROCESS_PROMPT,
    "feature_engineering": FEATURE_ENGINEERING_PROMPT,
    "classification_model": CLASSIFICATION_MODEL_PROMPT,
    "regression_model": REGRESSION_MODEL_PROMPT,
}

# Task type -> output-description note.
TOOL_OUTPUT_DESC = {
    "data_preprocess": DATA_PREPROCESS_OUTPUT_DESC,
    "feature_engineering": FEATURE_ENGINEERING_OUTPUT_DESC,
    "classification_model": CLASSIFICATION_MODEL_OUTPUT_DESC,
    "regression_model": REGRESSION_MODEL_OUTPUT_DESC,
}

# Task type -> dotted module path. Both model task types share the same
# ``ml_model`` module.
ML_MODULE_MAP = {
    "data_preprocess": "metagpt.tools.functions.libs.machine_learning.data_preprocess",
    "feature_engineering": "metagpt.tools.functions.libs.machine_learning.feature_engineering",
    "classification_model": "metagpt.tools.functions.libs.machine_learning.ml_model",
    "regression_model": "metagpt.tools.functions.libs.machine_learning.ml_model",
}