update di prompt

2026-07-23 17:01:08 +02:00 · 2024-09-13 16:57:37 +08:00 · 2024-09-13 16:57:37 +08:00 · c54e4121c6
commit c54e4121c6
parent 9f04278383
2 changed files with 17 additions and 7 deletions
--- a/expo/data/dataset.py
+++ b/expo/data/dataset.py
@@ -16,15 +16,22 @@ Perform data analysis, data preprocessing, feature engineering, and modeling to
 Report {metric} on the eval data. Do not plot or make any visualizations.
 """

+RECOMMENDATION = """\
+## Base Models and Ensemble
+You can consider using the following base models:
+’GBM’ (LightGBM) ‘CAT’ (CatBoost) ‘XGB’ (XGBoost) ‘RF’ (random forest) ‘XT’ (extremely randomized trees) ‘KNN’ (k-nearest neighbors) ‘LR’ (linear regression)
+"""

-DI_INSTRUCTION = """\
-**Attention** 
+DI_INSTRUCTION = (
+    RECOMMENDATION
+    + """**Attention** 
 1. Please do not leak the target label in any form during training.
 2. Test set does not have the target column.
 3. You should perform transformations on train, dev, and test sets at the same time (it's a good idea to define functions for this and avoid code repetition).
-4. If labels are transformed during training, they should be transformed back to the original format before saving the predictions.
+4. When scaling or transforming features, make sure the target column is not included.
 5. You could utilize dev set to validate and improve model training.
-6. Use techniques to avoid overfitting.
+6. To avoid overfitting, train a weighted ensemble model such as StackingClassifier or StackingRegressor using **dev set** after base models being trained
+7. Make sure the model prototyping is fast. 

 ## Saving Dev and Test Predictions
 1. Save the prediction results of BOTH the dev set and test set in `dev_predictions.csv` and `test_predictions.csv` respectively in the output directory. 
@@ -37,6 +44,7 @@ Print the train and dev set performance in the last step.
 # Output dir
 {output_dir}
 """
+)

 TASK_PROMPT = """\
 # User requirement
--- a/expo/insights/instruction_generator.py
+++ b/expo/insights/instruction_generator.py
@@ -79,7 +79,7 @@ class InstructionGenerator:
        return data

    @staticmethod
-    async def generate_new_instructions(task_id, original_instruction, max_num, file_path):
+    async def generate_new_instructions(task_id, original_instruction, max_num, file_path, ext_info=None):
        data = InstructionGenerator.load_analysis_pool(file_path, task_id)
        new_instructions = []
        if len(data) == 0:
@@ -91,12 +91,14 @@ class InstructionGenerator:
            else:
                item = data[i]
                insights = item["Analysis"]
-            new_instruction = await InstructionGenerator.generate_new_instruction(original_instruction, insights)
+            new_instruction = await InstructionGenerator.generate_new_instruction(
+                original_instruction, insights, ext_info
+            )
            new_instructions.append(new_instruction)
        return new_instructions

    @staticmethod
-    async def generate_new_instruction(original_instruction, insights):
+    async def generate_new_instruction(original_instruction, insights, ext_info):
        prompt = CHANGE_INSTRUCTION.format(instruction=original_instruction, insights=insights)
        llm = LLM()
        context = llm.format_msg([Message(content=prompt, role="user")])