From f856d768fe2630c1482cfb8b568f48e97978acc2 Mon Sep 17 00:00:00 2001
From: Yizhou Chi <chiyizhou@fuzhi.ai>
Date: Sat, 14 Sep 2024 18:05:02 +0800
Subject: [PATCH] Remove base-model recommendation from DI initial prompt, add
 recommendation to model_train task type prompt

---
 expo/data/dataset.py         | 27 ++-------------------------
 metagpt/prompts/task_type.py |  3 +++
 2 files changed, 5 insertions(+), 25 deletions(-)

diff --git a/expo/data/dataset.py b/expo/data/dataset.py
index 8af0c485e..9748cb8c2 100644
--- a/expo/data/dataset.py
+++ b/expo/data/dataset.py
@@ -27,30 +27,8 @@ STACKING = """
 
 SPECIAL_INSTRUCTIONS = {"ag": USE_AG, "stacking": STACKING}
 
-RECOMMENDATION = """
-## Base Models
-You have access to the following base models:
-Tabular:
-LightGBM, CatBoost, XGBoost, random forest, extremely randomized trees, k-nearest neighbors, linear regression
-
-Image:
-ResNet, DenseNet, VGG, Inception, MobileNet, EfficientNet
-
-Text:
-BERT, RoBERTa, DistilBERT, GPT-2
-"""
-
-# The RECOMMENDATION above is not tested but might be needed for multi-modal datasets
-
-RECOMMENDATION = """
-## Base Models
-You have access to the following base models:
-LightGBM, CatBoost, XGBoost, random forest, extremely randomized trees, k-nearest neighbors, linear regression
-"""
-
-DI_INSTRUCTION = (
-    RECOMMENDATION
-    + """## Attention
+DI_INSTRUCTION = """
+## Attention
 1. Please do not leak the target label in any form during training.
 2. Test set does not have the target column.
 3. When conducting data exploration or analysis, print out the results of your findings.
@@ -69,7 +47,6 @@ Print the train and dev set performance in the last step.
 # Output dir
 {output_dir}
 """
-)
 
 TASK_PROMPT = """
 # User requirement
diff --git a/metagpt/prompts/task_type.py b/metagpt/prompts/task_type.py
index ca0aae572..6b230fc9e 100644
--- a/metagpt/prompts/task_type.py
+++ b/metagpt/prompts/task_type.py
@@ -34,6 +34,9 @@ The current task is about feature engineering. when performing it, please adhere
 # Prompt for taking on "model_train" tasks
 MODEL_TRAIN_PROMPT = """
 The current task is about training a model, please ensure high performance:
+- For tabular datasets - you have access to LightGBM, CatBoost, XGBoost, random forest, extremely randomized trees, k-nearest neighbors, linear regression, etc.
+- For image datasets - you have access to ResNet, VGG, Inception, MobileNet, DenseNet, EfficientNet, etc.
+- For text datasets - you have access to BERT, GPT-2, RoBERTa, DistilBERT, T5, etc.
 - Keep in mind that your user prioritizes results and is highly focused on model performance. So, when needed, feel free to use models of any complexity to improve effectiveness, such as XGBoost, CatBoost, etc.
 - If non-numeric columns exist, perform label encode together with all steps.
 - Use the data from previous task result directly, do not mock or reload data yourself.