From c54e4121c611473ef7ef874d19cfb5891280d091 Mon Sep 17 00:00:00 2001
From: Yizhou Chi <chiyizhou@fuzhi.ai>
Date: Fri, 13 Sep 2024 16:57:37 +0800
Subject: [PATCH] Update DI prompt; add ext_info to instruction generator

---
 expo/data/dataset.py                   | 16 ++++++++++++----
 expo/insights/instruction_generator.py |  8 +++++---
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/expo/data/dataset.py b/expo/data/dataset.py
index 3b2017d1a..d2ec48326 100644
--- a/expo/data/dataset.py
+++ b/expo/data/dataset.py
@@ -16,15 +16,22 @@ Perform data analysis, data preprocessing, feature engineering, and modeling to
 Report {metric} on the eval data. Do not plot or make any visualizations.
 """
 
+RECOMMENDATION = """\
+## Base Models and Ensemble
+You can consider using the following base models:
+‘GBM’ (LightGBM) ‘CAT’ (CatBoost) ‘XGB’ (XGBoost) ‘RF’ (random forest) ‘XT’ (extremely randomized trees) ‘KNN’ (k-nearest neighbors) ‘LR’ (linear regression)
+"""
 
-DI_INSTRUCTION = """\
-**Attention** 
+DI_INSTRUCTION = (
+    RECOMMENDATION
+    + """**Attention** 
 1. Please do not leak the target label in any form during training.
 2. Test set does not have the target column.
 3. You should perform transformations on train, dev, and test sets at the same time (it's a good idea to define functions for this and avoid code repetition).
-4. If labels are transformed during training, they should be transformed back to the original format before saving the predictions.
+4. When scaling or transforming features, make sure the target column is not included.
 5. You could utilize dev set to validate and improve model training.
-6. Use techniques to avoid overfitting.
+6. To avoid overfitting, train a weighted ensemble model such as StackingClassifier or StackingRegressor using the **dev set** after the base models have been trained.
+7. Make sure the model prototyping is fast. 
 
 ## Saving Dev and Test Predictions
 1. Save the prediction results of BOTH the dev set and test set in `dev_predictions.csv` and `test_predictions.csv` respectively in the output directory. 
@@ -37,6 +44,7 @@ Print the train and dev set performance in the last step.
 # Output dir
 {output_dir}
 """
+)
 
 TASK_PROMPT = """\
 # User requirement
diff --git a/expo/insights/instruction_generator.py b/expo/insights/instruction_generator.py
index c9ff7ec6e..a800f4507 100644
--- a/expo/insights/instruction_generator.py
+++ b/expo/insights/instruction_generator.py
@@ -79,7 +79,7 @@ class InstructionGenerator:
         return data
 
     @staticmethod
-    async def generate_new_instructions(task_id, original_instruction, max_num, file_path):
+    async def generate_new_instructions(task_id, original_instruction, max_num, file_path, ext_info=None):
         data = InstructionGenerator.load_analysis_pool(file_path, task_id)
         new_instructions = []
         if len(data) == 0:
@@ -91,12 +91,14 @@ class InstructionGenerator:
             else:
                 item = data[i]
                 insights = item["Analysis"]
-            new_instruction = await InstructionGenerator.generate_new_instruction(original_instruction, insights)
+            new_instruction = await InstructionGenerator.generate_new_instruction(
+                original_instruction, insights, ext_info
+            )
             new_instructions.append(new_instruction)
         return new_instructions
 
     @staticmethod
-    async def generate_new_instruction(original_instruction, insights):
+    async def generate_new_instruction(original_instruction, insights, ext_info):
         prompt = CHANGE_INSTRUCTION.format(instruction=original_instruction, insights=insights)
         llm = LLM()
         context = llm.format_msg([Message(content=prompt, role="user")])