From 8beca0faddd33b981d23d875c1a59df0b71947f0 Mon Sep 17 00:00:00 2001 From: Yizhou Chi Date: Sat, 14 Sep 2024 15:17:42 +0800 Subject: [PATCH] 1. add special instruction 2. add fixed insights --- expo/MCTS.py | 24 ++++++++------- expo/README.md | 24 ++++++++++----- expo/data/dataset.py | 42 ++++++++++++++++++-------- expo/experimenter/aug.py | 4 ++- expo/experimenter/custom.py | 7 ++++- expo/experimenter/experimenter.py | 3 +- expo/experimenter/mcts.py | 12 +++----- expo/insights/fixed_insights.json | 22 ++++++++++++++ expo/insights/instruction_generator.py | 15 +++++++-- expo/requirements.txt | 1 + expo/run_experiment.py | 4 ++- 11 files changed, 111 insertions(+), 47 deletions(-) create mode 100644 expo/insights/fixed_insights.json diff --git a/expo/MCTS.py b/expo/MCTS.py index 360baac8d..265356f65 100644 --- a/expo/MCTS.py +++ b/expo/MCTS.py @@ -15,18 +15,18 @@ from metagpt.tools.tool_recommend import ToolRecommender from metagpt.utils.common import read_json_file -def initialize_di_root_node(task, data_config, low_is_better=False, reflection=True, name=""): +def initialize_di_root_node(state, reflection: bool = True): start_task_id = 2 - state = create_initial_state( - task, start_task_id=start_task_id, data_config=data_config, low_is_better=low_is_better, name=name - ) + # state = create_initial_state( + # task, start_task_id=start_task_id, data_config=data_config, low_is_better=low_is_better, name=name + # ) role = ResearchAssistant( node_id="0", start_task_id=start_task_id, use_reflection=reflection, role_dir=state["node_dir"] ) return role, Node(parent=None, state=state, action=None, value=0) -def create_initial_state(task, start_task_id, data_config, low_is_better, name): +def create_initial_state(task, start_task_id, data_config, low_is_better: bool, name: str, special_instruction: str): initial_state = { "task": task, "work_dir": data_config["work_dir"], @@ -34,7 +34,9 @@ def create_initial_state(task, start_task_id, data_config, low_is_better, name): 
"dataset_config": data_config["datasets"][task], "datasets_dir": get_split_dataset_path(task, data_config), "exp_pool_path": get_exp_pool_path(task, data_config, pool_name="ds_analysis_pool"), - "requirement": generate_task_requirement(task, data_config), + "requirement": generate_task_requirement( + task, data_config, is_di=True, special_instruction=special_instruction + ), "has_run": False, "start_task_id": start_task_id, "low_is_better": low_is_better, @@ -157,6 +159,7 @@ class Node: original_instruction=original_instruction, max_num=max_children, file_path=self.state["exp_pool_path"], + use_fixed_insights=self.use_fixed_insights, ) new_state = self.state.copy() new_state["start_task_id"] += 1 @@ -234,9 +237,10 @@ class MCTS: c_explore: float = 1.4 c_unvisited: float = 0.8 - def __init__(self, root_node, max_depth): + def __init__(self, root_node, max_depth, use_fixed_insights): self.root_node = root_node self.max_depth = max_depth + self.use_fixed_insights = use_fixed_insights def select(self, node: Node): node = self.best_child() @@ -303,10 +307,8 @@ class MCTS: def get_num_simulations(self): return self.root_node.visited - async def search(self, task, data_config, name, rollouts, load_tree=False, low_is_better=False, reflection=False): - role, root = initialize_di_root_node( - task, data_config, low_is_better=low_is_better, reflection=reflection, name=name - ) + async def search(self, state, rollouts, load_tree=False, reflection=False): + role, root = initialize_di_root_node(state, reflection=reflection) self.root_node = root tree_loaded = False if load_tree: diff --git a/expo/README.md b/expo/README.md index 55ea7eed4..00d1cae50 100644 --- a/expo/README.md +++ b/expo/README.md @@ -187,16 +187,10 @@ ### Base DI For setup, check 5. 
- `python run_experiment.py --exp_mode base --task titanic --num_experiments 10` +- Ask DI to use AutoGluon: `--special_instruction ag` +- Ask DI to use the stacking ensemble method: `--special_instruction stacking` -### DI RandomSearch -For setup, check 5. - -- Single insight -`python run_experiment.py --exp_mode aug --task titanic --aug_mode single` - -- Set insight -`python run_experiment.py --exp_mode aug --task titanic --aug_mode set` ## 5. DI MCTS @@ -223,6 +217,20 @@ #### Run - `python run_experiment.py --exp_mode mcts --task househouse_prices --rollout 10 --low_is_better` +In addition to the generated insights, include the fixed insights saved in `insights/fixed_insights.json` +- `--use_fixed_insights` + + + +#### Ablation Study + +**DI RandomSearch** + +- Single insight +`python run_experiment.py --exp_mode aug --task titanic --aug_mode single` + +- Set insight +`python run_experiment.py --exp_mode aug --task titanic --aug_mode set` diff --git a/expo/data/dataset.py b/expo/data/dataset.py index d2ec48326..03b80985a 100644 --- a/expo/data/dataset.py +++ b/expo/data/dataset.py @@ -10,16 +10,27 @@ from sklearn.model_selection import train_test_split from expo.insights.solution_designer import SolutionDesigner -BASE_USER_REQUIREMENT = """\ +BASE_USER_REQUIREMENT = """ This is a {datasetname} dataset. Your goal is to predict the target column `{target_col}`. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report {metric} on the eval data. Do not plot or make any visualizations. """ -RECOMMENDATION = """\ +USE_AG = """ +7. Please use autogluon for model training with presets='medium_quality', time_limit=None, give dev dataset to tuning_data, and use right eval_metric. +""" + +STACKING = """ +7. To avoid overfitting, train a weighted ensemble model such as StackingClassifier or StackingRegressor. +8. You could do some quick model prototyping to see which models work best and then use them in the ensemble. 
+""" + +SPECIAL_INSTRUCTIONS = {"ag": USE_AG, "stacking": STACKING} + +RECOMMENDATION = """ ## Base Models and Ensemble You can consider using the following base models: -’GBM’ (LightGBM) ‘CAT’ (CatBoost) ‘XGB’ (XGBoost) ‘RF’ (random forest) ‘XT’ (extremely randomized trees) ‘KNN’ (k-nearest neighbors) ‘LR’ (linear regression) +`GBM` (LightGBM) `CAT` (CatBoost) `XGB` (XGBoost) `RF` (random forest) `XT` (extremely randomized trees) `KNN` (k-nearest neighbors) ‘LR’ (linear regression) """ DI_INSTRUCTION = ( @@ -27,11 +38,10 @@ DI_INSTRUCTION = ( + """**Attention** 1. Please do not leak the target label in any form during training. 2. Test set does not have the target column. -3. You should perform transformations on train, dev, and test sets at the same time (it's a good idea to define functions for this and avoid code repetition). -4. When scaling or transforming features, make sure the target column is not included. -5. You could utilize dev set to validate and improve model training. -6. To avoid overfitting, train a weighted ensemble model such as StackingClassifier or StackingRegressor using **dev set** after base models being trained -7. Make sure the model prototyping is fast. +3. When conducting data exploration or analysis, print out the results of your findings. +4. You should perform transformations on train, dev, and test sets at the same time (it's a good idea to define functions for this and avoid code repetition). +5. When scaling or transforming features, make sure the target column is not included. +6. You could utilize dev set to validate and improve model training. {special_instruction} ## Saving Dev and Test Predictions 1. Save the prediction results of BOTH the dev set and test set in `dev_predictions.csv` and `test_predictions.csv` respectively in the output directory. @@ -46,7 +56,7 @@ Print the train and dev set performance in the last step. 
""" ) -TASK_PROMPT = """\ +TASK_PROMPT = """ # User requirement {user_requirement} {additional_instruction} @@ -142,12 +152,18 @@ def create_dataset_dict(dataset): return dataset_dict -def generate_di_instruction(output_dir): - additional_instruction = DI_INSTRUCTION.format(output_dir=output_dir) +def generate_di_instruction(output_dir, special_instruction): + if special_instruction: + special_instruction_prompt = SPECIAL_INSTRUCTIONS[special_instruction] + else: + special_instruction_prompt = "" + additional_instruction = DI_INSTRUCTION.format( + output_dir=output_dir, special_instruction=special_instruction_prompt + ) return additional_instruction -def generate_task_requirement(task_name, data_config, is_di=True): +def generate_task_requirement(task_name, data_config, is_di=True, special_instruction=None): user_requirement = get_user_requirement(task_name, data_config) split_dataset_path = get_split_dataset_path(task_name, data_config) train_path = split_dataset_path["train"] @@ -158,7 +174,7 @@ def generate_task_requirement(task_name, data_config, is_di=True): datasets_dir = data_config["datasets_dir"] data_info_path = f"{datasets_dir}/{task_name}/dataset_info.json" if is_di: - additional_instruction = generate_di_instruction(output_dir) + additional_instruction = generate_di_instruction(output_dir, special_instruction) else: additional_instruction = "" user_requirement = TASK_PROMPT.format( diff --git a/expo/experimenter/aug.py b/expo/experimenter/aug.py index 8312f57fc..e57d024bd 100644 --- a/expo/experimenter/aug.py +++ b/expo/experimenter/aug.py @@ -17,7 +17,9 @@ class AugExperimenter(Experimenter): # state = create_initial_state(self.args.task, start_task_id=1, data_config=self.data_config, low_is_better=self.args.low_is_better, name="") user_requirement = self.state["requirement"] exp_pool_path = get_exp_pool_path(self.args.task, self.data_config, pool_name="ds_analysis_pool") - exp_pool = InstructionGenerator.load_analysis_pool(exp_pool_path) + exp_pool = 
InstructionGenerator.load_analysis_pool( + exp_pool_path, use_fixed_insights=self.args.use_fixed_insights + ) if self.args.aug_mode == "single": exps = InstructionGenerator._random_sample(exp_pool, self.args.num_experiments) exps = [exp["Analysis"] for exp in exps] diff --git a/expo/experimenter/custom.py b/expo/experimenter/custom.py index df090fb58..92b7dafa2 100644 --- a/expo/experimenter/custom.py +++ b/expo/experimenter/custom.py @@ -18,7 +18,12 @@ class CustomExperimenter(Experimenter): self.name = kwargs.get("name", "") self.result_path = f"results/custom_{self.name}" self.state = create_initial_state( - self.task, start_task_id=1, data_config=self.data_config, low_is_better=self.low_is_better, name=self.name + self.task, + start_task_id=1, + data_config=self.data_config, + low_is_better=self.low_is_better, + name=self.name, + special_instruction=self.args.special_instruction, ) def run_experiment(self): diff --git a/expo/experimenter/experimenter.py b/expo/experimenter/experimenter.py index 418e0089a..89d589d7d 100644 --- a/expo/experimenter/experimenter.py +++ b/expo/experimenter/experimenter.py @@ -23,7 +23,8 @@ class Experimenter: start_task_id=1, data_config=self.data_config, low_is_better=self.args.low_is_better, - name="", + name=self.args.name, + special_instruction=self.args.special_instruction, ) async def run_di(self, di, user_requirement, run_idx): diff --git a/expo/experimenter/mcts.py b/expo/experimenter/mcts.py index fbe2f35f1..e06169a70 100644 --- a/expo/experimenter/mcts.py +++ b/expo/experimenter/mcts.py @@ -13,19 +13,15 @@ class MCTSExperimenter(Experimenter): async def run_experiment(self): if self.tree_mode == "greedy": - mcts = Greedy(root_node=None, max_depth=5) + mcts = Greedy(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights) elif self.tree_mode == "random": - mcts = Random(root_node=None, max_depth=5) + mcts = Random(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights) else: - mcts 
= MCTS(root_node=None, max_depth=5) + mcts = MCTS(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights) best_nodes = await mcts.search( - self.args.task, - self.data_config, - low_is_better=self.args.low_is_better, - load_tree=self.args.load_tree, + state=self.state, reflection=self.args.reflection, rollouts=self.args.rollouts, - name=self.args.name, ) best_node = best_nodes["global_best"] dev_best_node = best_nodes["dev_best"] diff --git a/expo/insights/fixed_insights.json b/expo/insights/fixed_insights.json new file mode 100644 index 000000000..e52745707 --- /dev/null +++ b/expo/insights/fixed_insights.json @@ -0,0 +1,22 @@ +[ +{ + "Analysis": "Use early stopping, hyperparameter tuning, and cross-validation to avoid overfitting and improve robustness of the model.", + "Category": "Model Training", + "task_id": 4 +}, +{ + "Analysis": "Use k-fold bagging and early stopping.", + "Category": "Model Training", + "task_id": 4 +}, +{ + "Analysis": "To avoid overfitting, train a weighted ensemble model such as StackingClassifier or StackingRegressor using the **dev set** after the base models have been trained.", + "Category": "Model Training", + "task_id": 4 +}, +{ + "Analysis": "Please use autogluon for model training with presets='medium_quality', time_limit=None, give dev dataset to tuning_data, and use right eval_metric.", + "Category": "Model Training", + "task_id": 4 +} +] \ No newline at end of file diff --git a/expo/insights/instruction_generator.py b/expo/insights/instruction_generator.py index a800f4507..07e5fb655 100644 --- a/expo/insights/instruction_generator.py +++ b/expo/insights/instruction_generator.py @@ -1,4 +1,5 @@ import json +import os import random from expo.utils import clean_json_from_rsp, load_data_config, mcts_logger @@ -68,8 +69,12 @@ class InstructionGenerator: return new_data @staticmethod - def load_analysis_pool(file_path, task_id=None): + def load_analysis_pool(file_path, use_fixed_insights, task_id=None): data = 
InstructionGenerator.load_json_data(file_path) + if use_fixed_insights: + current_directory = os.path.dirname(__file__) + fixed_insights = InstructionGenerator.load_json_data(f"{current_directory}/fixed_insights.json") + data.extend(fixed_insights) for item in data: if "task_id" not in item: raise ValueError("task_id is not found in the analysis pool") @@ -79,8 +84,12 @@ class InstructionGenerator: return data @staticmethod - async def generate_new_instructions(task_id, original_instruction, max_num, file_path, ext_info=None): - data = InstructionGenerator.load_analysis_pool(file_path, task_id) + async def generate_new_instructions( + task_id, original_instruction, max_num, file_path, ext_info=None, use_fixed_insights=False + ): + data = InstructionGenerator.load_analysis_pool( + file_path, task_id=task_id, use_fixed_insights=use_fixed_insights + ) new_instructions = [] if len(data) == 0: mcts_logger.log("MCTS", f"No insights available for task {task_id}") diff --git a/expo/requirements.txt b/expo/requirements.txt index 04de1a8bb..e85818bbe 100644 --- a/expo/requirements.txt +++ b/expo/requirements.txt @@ -3,3 +3,4 @@ openml==0.14.2 # ml module to run in DI xgboost catboost +lightgbm diff --git a/expo/run_experiment.py b/expo/run_experiment.py index 2123fade3..f1b5b2d80 100644 --- a/expo/run_experiment.py +++ b/expo/run_experiment.py @@ -28,11 +28,11 @@ def get_mcts_args(parser): parser.add_argument("--no_load_tree", dest="load_tree", action="store_false") parser.set_defaults(load_tree=False) parser.add_argument("--rollouts", type=int, default=5) + parser.add_argument("--use_fixed_insights", dest="use_fixed_insights", action="store_true") def get_aug_exp_args(parser): parser.add_argument("--aug_mode", type=str, default="single", choices=["single", "set"]) - parser.add_argument("--num_experiments", type=int, default=1) def get_di_args(parser): @@ -41,6 +41,8 @@ def get_di_args(parser): parser.set_defaults(low_is_better=False) parser.add_argument("--reflection", 
dest="reflection", action="store_true") parser.add_argument("--no_reflection", dest="reflection", action="store_false") + parser.add_argument("--num_experiments", type=int, default=1) + parser.add_argument("--special_instruction", type=str, default=None, choices=["ag", "stacking"]) parser.set_defaults(reflection=True)