From 8beca0faddd33b981d23d875c1a59df0b71947f0 Mon Sep 17 00:00:00 2001
From: Yizhou Chi <chiyizhou@fuzhi.ai>
Date: Sat, 14 Sep 2024 15:17:42 +0800
Subject: [PATCH] 1. add special instruction 2. add fixed insights

---
 expo/MCTS.py                           | 24 ++++++++-------
 expo/README.md                         | 24 ++++++++++-----
 expo/data/dataset.py                   | 42 ++++++++++++++++++--------
 expo/experimenter/aug.py               |  4 ++-
 expo/experimenter/custom.py            |  7 ++++-
 expo/experimenter/experimenter.py      |  3 +-
 expo/experimenter/mcts.py              | 12 +++-----
 expo/insights/fixed_insights.json      | 22 ++++++++++++++
 expo/insights/instruction_generator.py | 15 +++++++--
 expo/requirements.txt                  |  1 +
 expo/run_experiment.py                 |  4 ++-
 11 files changed, 111 insertions(+), 47 deletions(-)
 create mode 100644 expo/insights/fixed_insights.json

diff --git a/expo/MCTS.py b/expo/MCTS.py
index 360baac8d..265356f65 100644
--- a/expo/MCTS.py
+++ b/expo/MCTS.py
@@ -15,18 +15,18 @@ from metagpt.tools.tool_recommend import ToolRecommender
 from metagpt.utils.common import read_json_file
 
 
-def initialize_di_root_node(task, data_config, low_is_better=False, reflection=True, name=""):
+def initialize_di_root_node(state, reflection: bool = True):
     start_task_id = 2
-    state = create_initial_state(
-        task, start_task_id=start_task_id, data_config=data_config, low_is_better=low_is_better, name=name
-    )
+    # state = create_initial_state(
+    #     task, start_task_id=start_task_id, data_config=data_config, low_is_better=low_is_better, name=name
+    # )
     role = ResearchAssistant(
         node_id="0", start_task_id=start_task_id, use_reflection=reflection, role_dir=state["node_dir"]
     )
     return role, Node(parent=None, state=state, action=None, value=0)
 
 
-def create_initial_state(task, start_task_id, data_config, low_is_better, name):
+def create_initial_state(task, start_task_id, data_config, low_is_better: bool, name: str, special_instruction: str):
     initial_state = {
         "task": task,
         "work_dir": data_config["work_dir"],
@@ -34,7 +34,9 @@ def create_initial_state(task, start_task_id, data_config, low_is_better, name):
         "dataset_config": data_config["datasets"][task],
         "datasets_dir": get_split_dataset_path(task, data_config),
         "exp_pool_path": get_exp_pool_path(task, data_config, pool_name="ds_analysis_pool"),
-        "requirement": generate_task_requirement(task, data_config),
+        "requirement": generate_task_requirement(
+            task, data_config, is_di=True, special_instruction=special_instruction
+        ),
         "has_run": False,
         "start_task_id": start_task_id,
         "low_is_better": low_is_better,
@@ -157,6 +159,7 @@ class Node:
             original_instruction=original_instruction,
             max_num=max_children,
             file_path=self.state["exp_pool_path"],
+            use_fixed_insights=self.use_fixed_insights,
         )
         new_state = self.state.copy()
         new_state["start_task_id"] += 1
@@ -234,9 +237,10 @@ class MCTS:
     c_explore: float = 1.4
     c_unvisited: float = 0.8
 
-    def __init__(self, root_node, max_depth):
+    def __init__(self, root_node, max_depth, use_fixed_insights):
         self.root_node = root_node
         self.max_depth = max_depth
+        self.use_fixed_insights = use_fixed_insights
 
     def select(self, node: Node):
         node = self.best_child()
@@ -303,10 +307,8 @@ class MCTS:
     def get_num_simulations(self):
         return self.root_node.visited
 
-    async def search(self, task, data_config, name, rollouts, load_tree=False, low_is_better=False, reflection=False):
-        role, root = initialize_di_root_node(
-            task, data_config, low_is_better=low_is_better, reflection=reflection, name=name
-        )
+    async def search(self, state, rollouts, load_tree=False, reflection=False):
+        role, root = initialize_di_root_node(state, reflection=reflection)
         self.root_node = root
         tree_loaded = False
         if load_tree:
diff --git a/expo/README.md b/expo/README.md
index 55ea7eed4..00d1cae50 100644
--- a/expo/README.md
+++ b/expo/README.md
@@ -187,16 +187,10 @@ ### Base DI
 For setup, check 5.
 
 - `python run_experiment.py --exp_mode base --task titanic --num_experiments 10`
+- Ask DI to use AutoGluon: `--special_instruction ag`
+- Ask DI to use the stacking ensemble method: `--special_instruction stacking`
 
 
-### DI RandomSearch
-For setup, check 5.
-
-- Single insight
-`python run_experiment.py --exp_mode aug --task titanic --aug_mode single`
-
-- Set insight
-`python run_experiment.py --exp_mode aug --task titanic --aug_mode set`
 
 
 ## 5. DI MCTS
@@ -223,6 +217,20 @@ #### Run
 - `python run_experiment.py --exp_mode mcts --task househouse_prices --rollout 10 --low_is_better`
 
 
+To include the fixed insights saved in `insights/fixed_insights.json` in addition to the generated insights, add the following flag:
+- `--use_fixed_insights`
+  
+
+
+#### Ablation Study
+
+**DI RandomSearch**
+
+- Single insight
+`python run_experiment.py --exp_mode aug --task titanic --aug_mode single`
+
+- Set insight
+`python run_experiment.py --exp_mode aug --task titanic --aug_mode set`
 
 
 
diff --git a/expo/data/dataset.py b/expo/data/dataset.py
index d2ec48326..03b80985a 100644
--- a/expo/data/dataset.py
+++ b/expo/data/dataset.py
@@ -10,16 +10,27 @@ from sklearn.model_selection import train_test_split
 
 from expo.insights.solution_designer import SolutionDesigner
 
-BASE_USER_REQUIREMENT = """\
+BASE_USER_REQUIREMENT = """
 This is a {datasetname} dataset. Your goal is to predict the target column `{target_col}`.
 Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. 
 Report {metric} on the eval data. Do not plot or make any visualizations.
 """
 
-RECOMMENDATION = """\
+USE_AG = """
+7. Please use autogluon for model training with presets='medium_quality', time_limit=None, give dev dataset to tuning_data, and use right eval_metric.
+"""
+
+STACKING = """
+7. To avoid overfitting, train a weighted ensemble model such as StackingClassifier or StackingRegressor.
+8. You could do some quick model prototyping to see which models work best and then use them in the ensemble. 
+"""
+
+SPECIAL_INSTRUCTIONS = {"ag": USE_AG, "stacking": STACKING}
+
+RECOMMENDATION = """
 ## Base Models and Ensemble
 You can consider using the following base models:
-’GBM’ (LightGBM) ‘CAT’ (CatBoost) ‘XGB’ (XGBoost) ‘RF’ (random forest) ‘XT’ (extremely randomized trees) ‘KNN’ (k-nearest neighbors) ‘LR’ (linear regression)
+`GBM` (LightGBM) `CAT` (CatBoost) `XGB` (XGBoost) `RF` (random forest) `XT` (extremely randomized trees) `KNN` (k-nearest neighbors) `LR` (linear regression)
 """
 
 DI_INSTRUCTION = (
@@ -27,11 +38,10 @@ DI_INSTRUCTION = (
     + """**Attention** 
 1. Please do not leak the target label in any form during training.
 2. Test set does not have the target column.
-3. You should perform transformations on train, dev, and test sets at the same time (it's a good idea to define functions for this and avoid code repetition).
-4. When scaling or transforming features, make sure the target column is not included.
-5. You could utilize dev set to validate and improve model training.
-6. To avoid overfitting, train a weighted ensemble model such as StackingClassifier or StackingRegressor using **dev set** after base models being trained
-7. Make sure the model prototyping is fast. 
+3. When conducting data exploration or analysis, print out the results of your findings.
+4. You should perform transformations on train, dev, and test sets at the same time (it's a good idea to define functions for this and avoid code repetition).
+5. When scaling or transforming features, make sure the target column is not included.
+6. You could utilize dev set to validate and improve model training. {special_instruction}
 
 ## Saving Dev and Test Predictions
 1. Save the prediction results of BOTH the dev set and test set in `dev_predictions.csv` and `test_predictions.csv` respectively in the output directory. 
@@ -46,7 +56,7 @@ Print the train and dev set performance in the last step.
 """
 )
 
-TASK_PROMPT = """\
+TASK_PROMPT = """
 # User requirement
 {user_requirement}
 {additional_instruction}
@@ -142,12 +152,18 @@ def create_dataset_dict(dataset):
     return dataset_dict
 
 
-def generate_di_instruction(output_dir):
-    additional_instruction = DI_INSTRUCTION.format(output_dir=output_dir)
+def generate_di_instruction(output_dir, special_instruction):
+    if special_instruction:
+        special_instruction_prompt = SPECIAL_INSTRUCTIONS[special_instruction]
+    else:
+        special_instruction_prompt = ""
+    additional_instruction = DI_INSTRUCTION.format(
+        output_dir=output_dir, special_instruction=special_instruction_prompt
+    )
     return additional_instruction
 
 
-def generate_task_requirement(task_name, data_config, is_di=True):
+def generate_task_requirement(task_name, data_config, is_di=True, special_instruction=None):
     user_requirement = get_user_requirement(task_name, data_config)
     split_dataset_path = get_split_dataset_path(task_name, data_config)
     train_path = split_dataset_path["train"]
@@ -158,7 +174,7 @@ def generate_task_requirement(task_name, data_config, is_di=True):
     datasets_dir = data_config["datasets_dir"]
     data_info_path = f"{datasets_dir}/{task_name}/dataset_info.json"
     if is_di:
-        additional_instruction = generate_di_instruction(output_dir)
+        additional_instruction = generate_di_instruction(output_dir, special_instruction)
     else:
         additional_instruction = ""
     user_requirement = TASK_PROMPT.format(
diff --git a/expo/experimenter/aug.py b/expo/experimenter/aug.py
index 8312f57fc..e57d024bd 100644
--- a/expo/experimenter/aug.py
+++ b/expo/experimenter/aug.py
@@ -17,7 +17,9 @@ class AugExperimenter(Experimenter):
         # state = create_initial_state(self.args.task, start_task_id=1, data_config=self.data_config, low_is_better=self.args.low_is_better, name="")
         user_requirement = self.state["requirement"]
         exp_pool_path = get_exp_pool_path(self.args.task, self.data_config, pool_name="ds_analysis_pool")
-        exp_pool = InstructionGenerator.load_analysis_pool(exp_pool_path)
+        exp_pool = InstructionGenerator.load_analysis_pool(
+            exp_pool_path, use_fixed_insights=self.args.use_fixed_insights
+        )
         if self.args.aug_mode == "single":
             exps = InstructionGenerator._random_sample(exp_pool, self.args.num_experiments)
             exps = [exp["Analysis"] for exp in exps]
diff --git a/expo/experimenter/custom.py b/expo/experimenter/custom.py
index df090fb58..92b7dafa2 100644
--- a/expo/experimenter/custom.py
+++ b/expo/experimenter/custom.py
@@ -18,7 +18,12 @@ class CustomExperimenter(Experimenter):
         self.name = kwargs.get("name", "")
         self.result_path = f"results/custom_{self.name}"
         self.state = create_initial_state(
-            self.task, start_task_id=1, data_config=self.data_config, low_is_better=self.low_is_better, name=self.name
+            self.task,
+            start_task_id=1,
+            data_config=self.data_config,
+            low_is_better=self.low_is_better,
+            name=self.name,
+            special_instruction=self.args.special_instruction,
         )
 
     def run_experiment(self):
diff --git a/expo/experimenter/experimenter.py b/expo/experimenter/experimenter.py
index 418e0089a..89d589d7d 100644
--- a/expo/experimenter/experimenter.py
+++ b/expo/experimenter/experimenter.py
@@ -23,7 +23,8 @@ class Experimenter:
             start_task_id=1,
             data_config=self.data_config,
             low_is_better=self.args.low_is_better,
-            name="",
+            name=self.args.name,
+            special_instruction=self.args.special_instruction,
         )
 
     async def run_di(self, di, user_requirement, run_idx):
diff --git a/expo/experimenter/mcts.py b/expo/experimenter/mcts.py
index fbe2f35f1..e06169a70 100644
--- a/expo/experimenter/mcts.py
+++ b/expo/experimenter/mcts.py
@@ -13,19 +13,15 @@ class MCTSExperimenter(Experimenter):
 
     async def run_experiment(self):
         if self.tree_mode == "greedy":
-            mcts = Greedy(root_node=None, max_depth=5)
+            mcts = Greedy(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights)
         elif self.tree_mode == "random":
-            mcts = Random(root_node=None, max_depth=5)
+            mcts = Random(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights)
         else:
-            mcts = MCTS(root_node=None, max_depth=5)
+            mcts = MCTS(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights)
         best_nodes = await mcts.search(
-            self.args.task,
-            self.data_config,
-            low_is_better=self.args.low_is_better,
-            load_tree=self.args.load_tree,
+            state=self.state,
             reflection=self.args.reflection,
             rollouts=self.args.rollouts,
-            name=self.args.name,
         )
         best_node = best_nodes["global_best"]
         dev_best_node = best_nodes["dev_best"]
diff --git a/expo/insights/fixed_insights.json b/expo/insights/fixed_insights.json
new file mode 100644
index 000000000..e52745707
--- /dev/null
+++ b/expo/insights/fixed_insights.json
@@ -0,0 +1,22 @@
+[
+{
+    "Analysis": "Use early stopping, hyperparameter tuning, and cross-validation to avoid overfitting and improve robustness of the model.",
+    "Category": "Model Training",
+    "task_id": 4
+},
+{
+    "Analysis": "use k-fold bagging and early stopping",
+    "Category": "Model Training",
+    "task_id": 4
+},
+{
+    "Analysis": "To avoid overfitting, train a weighted ensemble model such as StackingClassifier or StackingRegressor using **dev set** after base models being trained.",
+    "Category": "Model Training",
+    "task_id": 4
+},
+{
+    "Analysis": "Please use autogluon for model training with presets='medium_quality', time_limit=None, give dev dataset to tuning_data, and use right eval_metric.",
+    "Category": "Model Training",
+    "task_id": 4
+}
+]
\ No newline at end of file
diff --git a/expo/insights/instruction_generator.py b/expo/insights/instruction_generator.py
index a800f4507..07e5fb655 100644
--- a/expo/insights/instruction_generator.py
+++ b/expo/insights/instruction_generator.py
@@ -1,4 +1,5 @@
 import json
+import os
 import random
 
 from expo.utils import clean_json_from_rsp, load_data_config, mcts_logger
@@ -68,8 +69,12 @@ class InstructionGenerator:
         return new_data
 
     @staticmethod
-    def load_analysis_pool(file_path, task_id=None):
+    def load_analysis_pool(file_path, use_fixed_insights, task_id=None):
         data = InstructionGenerator.load_json_data(file_path)
+        if use_fixed_insights:
+            current_directory = os.path.dirname(__file__)
+            fixed_insights = InstructionGenerator.load_json_data(f"{current_directory}/fixed_insights.json")
+            data.extend(fixed_insights)
         for item in data:
             if "task_id" not in item:
                 raise ValueError("task_id is not found in the analysis pool")
@@ -79,8 +84,12 @@ class InstructionGenerator:
         return data
 
     @staticmethod
-    async def generate_new_instructions(task_id, original_instruction, max_num, file_path, ext_info=None):
-        data = InstructionGenerator.load_analysis_pool(file_path, task_id)
+    async def generate_new_instructions(
+        task_id, original_instruction, max_num, file_path, ext_info=None, use_fixed_insights=False
+    ):
+        data = InstructionGenerator.load_analysis_pool(
+            file_path, task_id=task_id, use_fixed_insights=use_fixed_insights
+        )
         new_instructions = []
         if len(data) == 0:
             mcts_logger.log("MCTS", f"No insights available for task {task_id}")
diff --git a/expo/requirements.txt b/expo/requirements.txt
index 04de1a8bb..e85818bbe 100644
--- a/expo/requirements.txt
+++ b/expo/requirements.txt
@@ -3,3 +3,4 @@ openml==0.14.2
 # ml module to run in DI
 xgboost
 catboost
+lightgbm
diff --git a/expo/run_experiment.py b/expo/run_experiment.py
index 2123fade3..f1b5b2d80 100644
--- a/expo/run_experiment.py
+++ b/expo/run_experiment.py
@@ -28,11 +28,11 @@ def get_mcts_args(parser):
     parser.add_argument("--no_load_tree", dest="load_tree", action="store_false")
     parser.set_defaults(load_tree=False)
     parser.add_argument("--rollouts", type=int, default=5)
+    parser.add_argument("--use_fixed_insights", dest="use_fixed_insights", action="store_true")
 
 
 def get_aug_exp_args(parser):
     parser.add_argument("--aug_mode", type=str, default="single", choices=["single", "set"])
-    parser.add_argument("--num_experiments", type=int, default=1)
 
 
 def get_di_args(parser):
@@ -41,6 +41,8 @@ def get_di_args(parser):
     parser.set_defaults(low_is_better=False)
     parser.add_argument("--reflection", dest="reflection", action="store_true")
     parser.add_argument("--no_reflection", dest="reflection", action="store_false")
+    parser.add_argument("--num_experiments", type=int, default=1)
+    parser.add_argument("--special_instruction", type=str, default=None, choices=["ag", "stacking"])
     parser.set_defaults(reflection=True)