update: 更新prompt，给出单step-code示例

2026-06-11 15:15:18 +02:00 · 2023-12-14 10:47:47 +08:00 · 2023-12-14 10:47:47 +08:00 · 41e872a8c0
commit 41e872a8c0
parent 31bd653f07
2 changed files with 29 additions and 9 deletions
--- a/metagpt/prompts/ml_engineer.py
+++ b/metagpt/prompts/ml_engineer.py
@@ -208,8 +208,10 @@ Follow steps below when you writing code if it's convenient.
 Each Class tool is described in JSON format. When you call it, import the tool from `{module_name}` first.
 {tool_catalog}

-# Output Example:
-For "fill missing value and handle outliers", the output code be like when there are training data and test data:
+# Step Example:
+Here is a coding example for each code step:
+[Step 1]: Handle missing values by imputing or dropping them. For numerical columns, use median or mean imputation
+[Code]
 ```python
 # Tools used: ['FillMissingValue']
 from metagpt.tools.functions.libs.data_preprocess import FillMissingValue
@@ -227,12 +229,26 @@ for col in num_cols:
    train_processed[col] = train_processed[col].clip(low, high)
    test_processed[col] = test_processed[col].clip(low, high)
 ```end
+[Step 2]: xxx
+[Code]:
+```python
+# Tools used: [xxx]
+from metagpt.tools.functions.libs.xxx import
+```end
+[Step 3]: xxx
+[Code]:
+```python
+# Tools used: [xxx]
+from metagpt.tools.functions.libs.xxx import
+```end

 # Constraints:
 - Prioritize using pre-defined tools for the same functionality.
 - Copy DataFrame before processing if needed.
+- Strictly follow the code steps to write code
 """
 #- If 'Code Steps' contains step done in 'Done Tasks', such as reading data, don't repeat it.
+# For "fill missing value and handle outliers", the output code looks like this when there are both training data and test data:

 DATA_PREPROCESS_PROMPT = """
 The current task is about data preprocessing, please note the following:
--- a/metagpt/roles/ml_engineer.py
+++ b/metagpt/roles/ml_engineer.py
@@ -33,14 +33,13 @@ STRUCTURAL_CONTEXT = """
 ## Current Task
 {current_task}
 ## Packages Installed
-scikit-learn
 pandas
 numpy
-lightgbm
-xgboost
-catboost
 """
-
+# scikit-learn
+# lightgbm
+# xgboost
+# catboost

 def truncate(result: str, keep_len: int = 1000) -> str:
    desc = "Truncated to show only the last 1000 characters\n"
@@ -290,11 +289,16 @@ if __name__ == "__main__":
    
    # requirement = "Perform data analysis on the provided data. Train a model to predict the target variable Survived. Include data preprocessing, feature engineering, and modeling in your pipeline. The metric is accuracy."
    
-    data_path = f"{DATA_PATH}/titanic"
-    requirement = f"This is a titanic passenger survival dataset, your goal is to predict passenger survival outcome. The target column is Survived. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report accuracy on the eval data. Train data path: '{data_path}/split_train.csv', eval data path: '{data_path}/split_eval.csv'."
+    # data_path = f"{DATA_PATH}/titanic"
+    # requirement = f"This is a titanic passenger survival dataset, your goal is to predict passenger survival outcome. The target column is Survived. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report accuracy on the eval data. Train data path: '{data_path}/split_train.csv', eval data path: '{data_path}/split_eval.csv'."
    # requirement = f"Run data analysis on sklearn Wine recognition dataset, include a plot, and train a model to predict wine class (20% as validation), and show validation accuracy"
    # data_path = f"{DATA_PATH}/icr-identify-age-related-conditions"
    # requirement = f"This is a medical dataset with over fifty anonymized health characteristics linked to three age-related conditions. Your goal is to predict whether a subject has or has not been diagnosed with one of these conditions.The target column is Class. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report f1 score on the eval data. Train data path: {data_path}/split_train.csv, eval data path: {data_path}/split_eval.csv."
+    # data_path = f"{DATA_PATH}/house-prices-advanced-regression-techniques"
+    # requirement = f"This is a house price dataset, your goal is to predict the sale price of a property based on its features. The target column is SalePrice. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report RMSE between the logarithm of the predicted value and the logarithm of the observed sales price on the eval data. Train data path: '{data_path}/split_train.csv', eval data path: '{data_path}/split_eval.csv'."
+    
+    data_path = f"{DATA_PATH}/santander-customer-transaction-prediction"
+    requirement = f"This is a customers financial dataset. Your goal is to predict which customers will make a specific transaction in the future. The target column is target. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report F1 Score on the eval data. Train data path: '{data_path}/split_train.csv', eval data path: '{data_path}/split_eval.csv' ."
    async def main(requirement: str = requirement, auto_run: bool = True):
        role = MLEngineer(goal=requirement, auto_run=auto_run)
        await role.run(requirement)