1. add special instruction

2. add fixed insights
2026-06-11 15:15:18 +02:00 · 2024-09-14 15:17:42 +08:00 · 2024-09-14 15:17:42 +08:00 · 8beca0fadd
commit 8beca0fadd
parent 9c3adbe0ac
11 changed files with 111 additions and 47 deletions
--- a/expo/MCTS.py
+++ b/expo/MCTS.py
@ -15,18 +15,18 @@ from metagpt.tools.tool_recommend import ToolRecommender
 from metagpt.utils.common import read_json_file


-def initialize_di_root_node(task, data_config, low_is_better=False, reflection=True, name=""):
+def initialize_di_root_node(state, reflection: bool = True):
+    """Create the root ResearchAssistant and root Node from a caller-built state (reads state["node_dir"])."""
+    # NOTE(review): the role below starts at task 2 regardless of state["start_task_id"];
+    # confirm the two are meant to differ now that the caller builds `state`.
    start_task_id = 2
-    state = create_initial_state(
-        task, start_task_id=start_task_id, data_config=data_config, low_is_better=low_is_better, name=name
-    )
    role = ResearchAssistant(
        node_id="0", start_task_id=start_task_id, use_reflection=reflection, role_dir=state["node_dir"]
    )
    return role, Node(parent=None, state=state, action=None, value=0)


-def create_initial_state(task, start_task_id, data_config, low_is_better, name):
+def create_initial_state(task, start_task_id, data_config, low_is_better: bool, name: str, special_instruction: str):
    initial_state = {
        "task": task,
        "work_dir": data_config["work_dir"],
@ -34,7 +34,9 @@ def create_initial_state(task, start_task_id, data_config, low_is_better, name):
        "dataset_config": data_config["datasets"][task],
        "datasets_dir": get_split_dataset_path(task, data_config),
        "exp_pool_path": get_exp_pool_path(task, data_config, pool_name="ds_analysis_pool"),
-        "requirement": generate_task_requirement(task, data_config),
+        "requirement": generate_task_requirement(
+            task, data_config, is_di=True, special_instruction=special_instruction
+        ),
        "has_run": False,
        "start_task_id": start_task_id,
        "low_is_better": low_is_better,
@ -157,6 +159,7 @@ class Node:
            original_instruction=original_instruction,
            max_num=max_children,
            file_path=self.state["exp_pool_path"],
+            use_fixed_insights=self.use_fixed_insights,
        )
        new_state = self.state.copy()
        new_state["start_task_id"] += 1
@ -234,9 +237,11 @@ class MCTS:
    c_explore: float = 1.4
    c_unvisited: float = 0.8

-    def __init__(self, root_node, max_depth):
+    def __init__(self, root_node, max_depth, use_fixed_insights=False):
+        """Keep the root node, the depth limit and the use_fixed_insights flag; the flag defaults to False."""
        self.root_node = root_node
        self.max_depth = max_depth
+        self.use_fixed_insights = use_fixed_insights

    def select(self, node: Node):
        node = self.best_child()
@ -303,10 +307,8 @@ class MCTS:
    def get_num_simulations(self):
        return self.root_node.visited

-    async def search(self, task, data_config, name, rollouts, load_tree=False, low_is_better=False, reflection=False):
-        role, root = initialize_di_root_node(
-            task, data_config, low_is_better=low_is_better, reflection=reflection, name=name
-        )
+    async def search(self, state, rollouts, load_tree=False, reflection=False):
+        role, root = initialize_di_root_node(state, reflection=reflection)
        self.root_node = root
        tree_loaded = False
        if load_tree:
--- a/expo/README.md
+++ b/expo/README.md
@ -187,16 +187,10 @@ ### Base DI
 For setup, check 5.

 - `python run_experiment.py --exp_mode base --task titanic --num_experiments 10`
+- Ask DI to use AutoGluon: `--special_instruction ag`
+- Ask DI to use the stacking ensemble method: `--special_instruction stacking`


-### DI RandomSearch
-For setup, check 5.
-
- Single insight
-`python run_experiment.py --exp_mode aug --task titanic --aug_mode single`
-
- Set insight
-`python run_experiment.py --exp_mode aug --task titanic --aug_mode set`


 ## 5. DI MCTS
@ -223,6 +217,20 @@ #### Run
 - `python run_experiment.py --exp_mode mcts --task househouse_prices --rollout 10 --low_is_better`


+To include the fixed insights saved in `insights/fixed_insights.json` in addition to the generated insights, add the flag:
+- `--use_fixed_insights`
+  
+
+
+#### Ablation Study
+
+**DI RandomSearch**
+
+- Single insight
+`python run_experiment.py --exp_mode aug --task titanic --aug_mode single`
+
+- Set insight
+`python run_experiment.py --exp_mode aug --task titanic --aug_mode set`



--- a/expo/data/dataset.py
+++ b/expo/data/dataset.py
@ -10,16 +10,27 @@ from sklearn.model_selection import train_test_split

 from expo.insights.solution_designer import SolutionDesigner

-BASE_USER_REQUIREMENT = """\
+BASE_USER_REQUIREMENT = """
 This is a {datasetname} dataset. Your goal is to predict the target column `{target_col}`.
 Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. 
 Report {metric} on the eval data. Do not plot or make any visualizations.
 """

-RECOMMENDATION = """\
+USE_AG = """
+7. Please use autogluon for model training with presets='medium_quality', time_limit=None, give dev dataset to tuning_data, and use right eval_metric.
+"""
+
+STACKING = """
+7. To avoid overfitting, train a weighted ensemble model such as StackingClassifier or StackingRegressor.
+8. You could do some quick model prototyping to see which models work best and then use them in the ensemble. 
+"""
+
+SPECIAL_INSTRUCTIONS = {"ag": USE_AG, "stacking": STACKING}
+
+RECOMMENDATION = """
 ## Base Models and Ensemble
 You can consider using the following base models:
-’GBM’ (LightGBM) ‘CAT’ (CatBoost) ‘XGB’ (XGBoost) ‘RF’ (random forest) ‘XT’ (extremely randomized trees) ‘KNN’ (k-nearest neighbors) ‘LR’ (linear regression)
+`GBM` (LightGBM) `CAT` (CatBoost) `XGB` (XGBoost) `RF` (random forest) `XT` (extremely randomized trees) `KNN` (k-nearest neighbors) ‘LR’ (linear regression)
 """

 DI_INSTRUCTION = (
@ -27,11 +38,10 @@ DI_INSTRUCTION = (
    + """**Attention** 
 1. Please do not leak the target label in any form during training.
 2. Test set does not have the target column.
-3. You should perform transformations on train, dev, and test sets at the same time (it's a good idea to define functions for this and avoid code repetition).
-4. When scaling or transforming features, make sure the target column is not included.
-5. You could utilize dev set to validate and improve model training.
-6. To avoid overfitting, train a weighted ensemble model such as StackingClassifier or StackingRegressor using **dev set** after base models being trained
-7. Make sure the model prototyping is fast. 
+3. When conducting data exploration or analysis, print out the results of your findings.
+4. You should perform transformations on train, dev, and test sets at the same time (it's a good idea to define functions for this and avoid code repetition).
+5. When scaling or transforming features, make sure the target column is not included.
+6. You could utilize dev set to validate and improve model training. {special_instruction}

 ## Saving Dev and Test Predictions
 1. Save the prediction results of BOTH the dev set and test set in `dev_predictions.csv` and `test_predictions.csv` respectively in the output directory. 
@ -46,7 +56,7 @@ Print the train and dev set performance in the last step.
 """
 )

-TASK_PROMPT = """\
+TASK_PROMPT = """
 # User requirement
 {user_requirement}
 {additional_instruction}
@ -142,12 +152,18 @@ def create_dataset_dict(dataset):
    return dataset_dict


-def generate_di_instruction(output_dir):
-    additional_instruction = DI_INSTRUCTION.format(output_dir=output_dir)
+def generate_di_instruction(output_dir, special_instruction=None):
+    """Return DI_INSTRUCTION filled with output_dir and the prompt selected by special_instruction."""
+    # A falsy value (None / "") adds no extra instruction; an unknown key raises KeyError
+    # from the SPECIAL_INSTRUCTIONS lookup.
+    special_instruction_prompt = SPECIAL_INSTRUCTIONS[special_instruction] if special_instruction else ""
+    additional_instruction = DI_INSTRUCTION.format(
+        output_dir=output_dir, special_instruction=special_instruction_prompt
+    )
    return additional_instruction


-def generate_task_requirement(task_name, data_config, is_di=True):
+def generate_task_requirement(task_name, data_config, is_di=True, special_instruction=None):
    user_requirement = get_user_requirement(task_name, data_config)
    split_dataset_path = get_split_dataset_path(task_name, data_config)
    train_path = split_dataset_path["train"]
@ -158,7 +174,7 @@ def generate_task_requirement(task_name, data_config, is_di=True):
    datasets_dir = data_config["datasets_dir"]
    data_info_path = f"{datasets_dir}/{task_name}/dataset_info.json"
    if is_di:
-        additional_instruction = generate_di_instruction(output_dir)
+        additional_instruction = generate_di_instruction(output_dir, special_instruction)
    else:
        additional_instruction = ""
    user_requirement = TASK_PROMPT.format(
--- a/expo/experimenter/aug.py
+++ b/expo/experimenter/aug.py
@ -17,7 +17,9 @@ class AugExperimenter(Experimenter):
        # state = create_initial_state(self.args.task, start_task_id=1, data_config=self.data_config, low_is_better=self.args.low_is_better, name="")
        user_requirement = self.state["requirement"]
        exp_pool_path = get_exp_pool_path(self.args.task, self.data_config, pool_name="ds_analysis_pool")
-        exp_pool = InstructionGenerator.load_analysis_pool(exp_pool_path)
+        exp_pool = InstructionGenerator.load_analysis_pool(
+            exp_pool_path, use_fixed_insights=self.args.use_fixed_insights
+        )
        if self.args.aug_mode == "single":
            exps = InstructionGenerator._random_sample(exp_pool, self.args.num_experiments)
            exps = [exp["Analysis"] for exp in exps]
--- a/expo/experimenter/custom.py
+++ b/expo/experimenter/custom.py
@ -18,7 +18,12 @@ class CustomExperimenter(Experimenter):
        self.name = kwargs.get("name", "")
        self.result_path = f"results/custom_{self.name}"
        self.state = create_initial_state(
-            self.task, start_task_id=1, data_config=self.data_config, low_is_better=self.low_is_better, name=self.name
+            self.task,
+            start_task_id=1,
+            data_config=self.data_config,
+            low_is_better=self.low_is_better,
+            name=self.name,
+            special_instruction=self.args.special_instruction,
        )

    def run_experiment(self):
--- a/expo/experimenter/experimenter.py
+++ b/expo/experimenter/experimenter.py
@ -23,7 +23,8 @@ class Experimenter:
            start_task_id=1,
            data_config=self.data_config,
            low_is_better=self.args.low_is_better,
-            name="",
+            name=self.args.name,
+            special_instruction=self.args.special_instruction,
        )

    async def run_di(self, di, user_requirement, run_idx):
--- a/expo/experimenter/mcts.py
+++ b/expo/experimenter/mcts.py
@ -13,19 +13,17 @@ class MCTSExperimenter(Experimenter):

    async def run_experiment(self):
        if self.tree_mode == "greedy":
-            mcts = Greedy(root_node=None, max_depth=5)
+            mcts = Greedy(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights)
        elif self.tree_mode == "random":
-            mcts = Random(root_node=None, max_depth=5)
+            mcts = Random(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights)
        else:
-            mcts = MCTS(root_node=None, max_depth=5)
+            mcts = MCTS(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights)
        best_nodes = await mcts.search(
-            self.args.task,
-            self.data_config,
-            low_is_better=self.args.low_is_better,
+            state=self.state,
+            # load_tree must keep being forwarded, otherwise the --load_tree flag is silently ignored
            load_tree=self.args.load_tree,
            reflection=self.args.reflection,
            rollouts=self.args.rollouts,
-            name=self.args.name,
        )
        best_node = best_nodes["global_best"]
        dev_best_node = best_nodes["dev_best"]
--- a/expo/insights/fixed_insights.json
+++ b/expo/insights/fixed_insights.json
@ -0,0 +1,22 @@
+[
+{
+    "Analysis": "Use early stopping, hyperparameter tuning, and cross-validation to avoid overfitting and improve robustness of the model.",
+    "Category": "Model Training",
+    "task_id": 4
+},
+{
+    "Analysis": "use k-fold bagging and early stopping",
+    "Category": "Model Training",
+    "task_id": 4
+},
+{
+    "Analysis": "To avoid overfitting, train a weighted ensemble model such as StackingClassifier or StackingRegressor using **dev set** after base models being trained.",
+    "Category": "Model Training",
+    "task_id": 4
+},
+{
+    "Analysis": "Please use autogluon for model training with presets='medium_quality', time_limit=None, give dev dataset to tuning_data, and use right eval_metric.",
+    "Category": "Model Training",
+    "task_id": 4
+}
+]
--- a/expo/insights/instruction_generator.py
+++ b/expo/insights/instruction_generator.py
@ -1,4 +1,5 @@
 import json
+import os
 import random

 from expo.utils import clean_json_from_rsp, load_data_config, mcts_logger
@ -68,8 +69,12 @@ class InstructionGenerator:
        return new_data

    @staticmethod
-    def load_analysis_pool(file_path, task_id=None):
+    def load_analysis_pool(file_path, use_fixed_insights, task_id=None):
        data = InstructionGenerator.load_json_data(file_path)
+        if use_fixed_insights:
+            current_directory = os.path.dirname(__file__)
+            fixed_insights = InstructionGenerator.load_json_data(f"{current_directory}/fixed_insights.json")
+            data.extend(fixed_insights)
        for item in data:
            if "task_id" not in item:
                raise ValueError("task_id is not found in the analysis pool")
@ -79,8 +84,12 @@ class InstructionGenerator:
        return data

    @staticmethod
-    async def generate_new_instructions(task_id, original_instruction, max_num, file_path, ext_info=None):
-        data = InstructionGenerator.load_analysis_pool(file_path, task_id)
+    async def generate_new_instructions(
+        task_id, original_instruction, max_num, file_path, ext_info=None, use_fixed_insights=False
+    ):
+        data = InstructionGenerator.load_analysis_pool(
+            file_path, task_id=task_id, use_fixed_insights=use_fixed_insights
+        )
        new_instructions = []
        if len(data) == 0:
            mcts_logger.log("MCTS", f"No insights available for task {task_id}")
--- a/expo/requirements.txt
+++ b/expo/requirements.txt
@ -3,3 +3,4 @@ openml==0.14.2
 # ml module to run in DI
 xgboost
 catboost
+lightgbm
--- a/expo/run_experiment.py
+++ b/expo/run_experiment.py
@ -28,11 +28,11 @@ def get_mcts_args(parser):
    parser.add_argument("--no_load_tree", dest="load_tree", action="store_false")
    parser.set_defaults(load_tree=False)
    parser.add_argument("--rollouts", type=int, default=5)
+    parser.add_argument("--use_fixed_insights", dest="use_fixed_insights", action="store_true")


 def get_aug_exp_args(parser):
    parser.add_argument("--aug_mode", type=str, default="single", choices=["single", "set"])
-    parser.add_argument("--num_experiments", type=int, default=1)


 def get_di_args(parser):
@ -41,6 +41,8 @@ def get_di_args(parser):
    parser.set_defaults(low_is_better=False)
    parser.add_argument("--reflection", dest="reflection", action="store_true")
    parser.add_argument("--no_reflection", dest="reflection", action="store_false")
+    parser.add_argument("--num_experiments", type=int, default=1)
+    parser.add_argument("--special_instruction", type=str, default=None, choices=["ag", "stacking"])
    parser.set_defaults(reflection=True)