diff --git a/expo/MCTS.py b/expo/MCTS.py index 360baac8d..228671e2c 100644 --- a/expo/MCTS.py +++ b/expo/MCTS.py @@ -15,18 +15,17 @@ from metagpt.tools.tool_recommend import ToolRecommender from metagpt.utils.common import read_json_file -def initialize_di_root_node(task, data_config, low_is_better=False, reflection=True, name=""): - start_task_id = 2 - state = create_initial_state( - task, start_task_id=start_task_id, data_config=data_config, low_is_better=low_is_better, name=name - ) +def initialize_di_root_node(state, reflection: bool = True): + # state = create_initial_state( + # task, start_task_id=start_task_id, data_config=data_config, low_is_better=low_is_better, name=name + # ) role = ResearchAssistant( - node_id="0", start_task_id=start_task_id, use_reflection=reflection, role_dir=state["node_dir"] + node_id="0", start_task_id=state["start_task_id"], use_reflection=reflection, role_dir=state["node_dir"] ) return role, Node(parent=None, state=state, action=None, value=0) -def create_initial_state(task, start_task_id, data_config, low_is_better, name): +def create_initial_state(task, start_task_id, data_config, low_is_better: bool, name: str, special_instruction: str): initial_state = { "task": task, "work_dir": data_config["work_dir"], @@ -34,11 +33,14 @@ def create_initial_state(task, start_task_id, data_config, low_is_better, name): "dataset_config": data_config["datasets"][task], "datasets_dir": get_split_dataset_path(task, data_config), "exp_pool_path": get_exp_pool_path(task, data_config, pool_name="ds_analysis_pool"), - "requirement": generate_task_requirement(task, data_config), + "requirement": generate_task_requirement( + task, data_config, is_di=True, special_instruction=special_instruction + ), "has_run": False, "start_task_id": start_task_id, "low_is_better": low_is_better, } + os.makedirs(initial_state["node_dir"], exist_ok=True) return initial_state @@ -146,7 +148,7 @@ class Node: role = role.model_copy() role.save_state(static_save=True) - 
async def expand(self, max_children): + async def expand(self, max_children, use_fixed_insights): if self.is_fully_expanded(): return insight_geneartor = InstructionGenerator() @@ -157,6 +159,7 @@ class Node: original_instruction=original_instruction, max_num=max_children, file_path=self.state["exp_pool_path"], + use_fixed_insights=use_fixed_insights, ) new_state = self.state.copy() new_state["start_task_id"] += 1 @@ -205,6 +208,7 @@ class Node: self.raw_reward = score_dict run_finished = True except Exception as e: + print(f"Error: {e}") mcts_logger.log("MCTS", f"Error in running the role: {e}") num_runs += 1 if not run_finished: @@ -234,9 +238,10 @@ class MCTS: c_explore: float = 1.4 c_unvisited: float = 0.8 - def __init__(self, root_node, max_depth): + def __init__(self, root_node, max_depth, use_fixed_insights): self.root_node = root_node self.max_depth = max_depth + self.use_fixed_insights = use_fixed_insights def select(self, node: Node): node = self.best_child() @@ -255,7 +260,7 @@ class MCTS: return max(all_children, key=uct) async def expand(self, node: Node, max_children=5): - await node.expand(max_children) + await node.expand(max_children, self.use_fixed_insights) if node not in self.children or not self.children[node]: self.children[node] = node.children return node.children @@ -303,10 +308,8 @@ class MCTS: def get_num_simulations(self): return self.root_node.visited - async def search(self, task, data_config, name, rollouts, load_tree=False, low_is_better=False, reflection=False): - role, root = initialize_di_root_node( - task, data_config, low_is_better=low_is_better, reflection=reflection, name=name - ) + async def search(self, state, rollouts, load_tree=False, reflection=False): + role, root = initialize_di_root_node(state, reflection=reflection) self.root_node = root tree_loaded = False if load_tree: diff --git a/expo/README.md b/expo/README.md index 4dc1341e6..a8a151111 100644 --- a/expo/README.md +++ b/expo/README.md @@ -223,16 +223,10 @@ ### 
Base DI For setup, check 5. - `python run_experiment.py --exp_mode base --task titanic --num_experiments 10` +- Ask DI to use AutoGluon: `--special_instruction ag` +- Ask DI to use the stacking ensemble method: `--special_instruction stacking` -### DI RandomSearch -For setup, check 5. - -- Single insight -`python run_experiment.py --exp_mode aug --task titanic --aug_mode single` - -- Set insight -`python run_experiment.py --exp_mode aug --task titanic --aug_mode set` ## 5. DI MCTS @@ -259,6 +253,20 @@ #### Run - `python run_experiment.py --exp_mode mcts --task house_prices --rollouts 10 --low_is_better` +In addition to the generated insights, include the fixed insights saved in `expo/insights/fixed_insights.json` +- `--use_fixed_insights` + + + +#### Ablation Study + +**DI RandomSearch** + +- Single insight +`python run_experiment.py --exp_mode aug --task titanic --aug_mode single` + +- Set insight +`python run_experiment.py --exp_mode aug --task titanic --aug_mode set` diff --git a/expo/data/dataset.py b/expo/data/dataset.py index 3b2017d1a..28bd26d2e 100644 --- a/expo/data/dataset.py +++ b/expo/data/dataset.py @@ -10,26 +10,37 @@ from sklearn.model_selection import train_test_split from expo.insights.solution_designer import SolutionDesigner -BASE_USER_REQUIREMENT = """\ +BASE_USER_REQUIREMENT = """ This is a {datasetname} dataset. Your goal is to predict the target column `{target_col}`. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report {metric} on the eval data. Do not plot or make any visualizations. """ +USE_AG = """ +7. Please use autogluon for model training with presets='medium_quality', time_limit=None, give dev dataset to tuning_data, and use right eval_metric. +""" -DI_INSTRUCTION = """\ -**Attention** +STACKING = """ +7. To avoid overfitting, train a weighted ensemble model such as StackingClassifier or StackingRegressor. +8. 
You could do some quick model prototyping to see which models work best and then use them in the ensemble. +""" + +SPECIAL_INSTRUCTIONS = {"ag": USE_AG, "stacking": STACKING} + +DI_INSTRUCTION = """ +## Attention 1. Please do not leak the target label in any form during training. 2. Test set does not have the target column. -3. You should perform transformations on train, dev, and test sets at the same time (it's a good idea to define functions for this and avoid code repetition). -4. If labels are transformed during training, they should be transformed back to the original format before saving the predictions. -5. You could utilize dev set to validate and improve model training. -6. Use techniques to avoid overfitting. +3. When conducting data exploration or analysis, print out the results of your findings. +4. You should perform transformations on train, dev, and test sets at the same time (it's a good idea to define functions for this and avoid code repetition). +5. When scaling or transforming features, make sure the target column is not included. +6. You could utilize dev set to validate and improve model training. {special_instruction} ## Saving Dev and Test Predictions 1. Save the prediction results of BOTH the dev set and test set in `dev_predictions.csv` and `test_predictions.csv` respectively in the output directory. - Both files should contain a single column named `target` with the predicted values. 2. Make sure the prediction results are in the same format as the target column in the training set. +- For instance, if the target column is categorical, the prediction results should be categorical as well. ## Output Performance Print the train and dev set performance in the last step. @@ -38,7 +49,7 @@ Print the train and dev set performance in the last step. 
{output_dir} """ -TASK_PROMPT = """\ +TASK_PROMPT = """ # User requirement {user_requirement} {additional_instruction} @@ -134,12 +145,18 @@ def create_dataset_dict(dataset): return dataset_dict -def generate_di_instruction(output_dir): - additional_instruction = DI_INSTRUCTION.format(output_dir=output_dir) +def generate_di_instruction(output_dir, special_instruction): + if special_instruction: + special_instruction_prompt = SPECIAL_INSTRUCTIONS[special_instruction] + else: + special_instruction_prompt = "" + additional_instruction = DI_INSTRUCTION.format( + output_dir=output_dir, special_instruction=special_instruction_prompt + ) return additional_instruction -def generate_task_requirement(task_name, data_config, is_di=True): +def generate_task_requirement(task_name, data_config, is_di=True, special_instruction=None): user_requirement = get_user_requirement(task_name, data_config) split_dataset_path = get_split_dataset_path(task_name, data_config) train_path = split_dataset_path["train"] @@ -150,7 +167,7 @@ def generate_task_requirement(task_name, data_config, is_di=True): datasets_dir = data_config["datasets_dir"] data_info_path = f"{datasets_dir}/{task_name}/dataset_info.json" if is_di: - additional_instruction = generate_di_instruction(output_dir) + additional_instruction = generate_di_instruction(output_dir, special_instruction) else: additional_instruction = "" user_requirement = TASK_PROMPT.format( diff --git a/expo/experimenter/aug.py b/expo/experimenter/aug.py index 8312f57fc..e57d024bd 100644 --- a/expo/experimenter/aug.py +++ b/expo/experimenter/aug.py @@ -17,7 +17,9 @@ class AugExperimenter(Experimenter): # state = create_initial_state(self.args.task, start_task_id=1, data_config=self.data_config, low_is_better=self.args.low_is_better, name="") user_requirement = self.state["requirement"] exp_pool_path = get_exp_pool_path(self.args.task, self.data_config, pool_name="ds_analysis_pool") - exp_pool = InstructionGenerator.load_analysis_pool(exp_pool_path) + 
exp_pool = InstructionGenerator.load_analysis_pool( + exp_pool_path, use_fixed_insights=self.args.use_fixed_insights + ) if self.args.aug_mode == "single": exps = InstructionGenerator._random_sample(exp_pool, self.args.num_experiments) exps = [exp["Analysis"] for exp in exps] diff --git a/expo/experimenter/custom.py b/expo/experimenter/custom.py index df090fb58..92b7dafa2 100644 --- a/expo/experimenter/custom.py +++ b/expo/experimenter/custom.py @@ -18,7 +18,12 @@ class CustomExperimenter(Experimenter): self.name = kwargs.get("name", "") self.result_path = f"results/custom_{self.name}" self.state = create_initial_state( - self.task, start_task_id=1, data_config=self.data_config, low_is_better=self.low_is_better, name=self.name + self.task, + start_task_id=1, + data_config=self.data_config, + low_is_better=self.low_is_better, + name=self.name, + special_instruction=self.args.special_instruction, ) def run_experiment(self): diff --git a/expo/experimenter/experimenter.py b/expo/experimenter/experimenter.py index 418e0089a..155108f8d 100644 --- a/expo/experimenter/experimenter.py +++ b/expo/experimenter/experimenter.py @@ -13,6 +13,7 @@ from expo.utils import DATA_CONFIG, save_notebook class Experimenter: result_path: str = "results/base" data_config = DATA_CONFIG + start_task_id = 1 def __init__(self, args, **kwargs): self.args = args @@ -20,10 +21,11 @@ class Experimenter: self.start_time = self.start_time_raw.strftime("%Y%m%d%H%M") self.state = create_initial_state( self.args.task, - start_task_id=1, + start_task_id=self.start_task_id, data_config=self.data_config, low_is_better=self.args.low_is_better, - name="", + name=self.args.name, + special_instruction=self.args.special_instruction, ) async def run_di(self, di, user_requirement, run_idx): @@ -86,7 +88,7 @@ class Experimenter: pred_node_path = os.path.join(state["node_dir"], f"{self.start_time}-{split}_predictions.csv") gt_path = os.path.join(state["datasets_dir"][f"{split}_target"]) preds = 
pd.read_csv(pred_path) - preds = preds[preds.columns.tolist()[0]] + preds = preds[preds.columns.tolist()[-1]] preds.to_csv(pred_node_path, index=False) gt = pd.read_csv(gt_path)["target"] metric = state["dataset_config"]["metric"] diff --git a/expo/experimenter/mcts.py b/expo/experimenter/mcts.py index fbe2f35f1..89f362b6b 100644 --- a/expo/experimenter/mcts.py +++ b/expo/experimenter/mcts.py @@ -6,6 +6,7 @@ from expo.MCTS import MCTS class MCTSExperimenter(Experimenter): result_path: str = "results/mcts" + start_task_id = 2 def __init__(self, args, tree_mode=None, **kwargs): super().__init__(args, **kwargs) @@ -13,19 +14,16 @@ class MCTSExperimenter(Experimenter): async def run_experiment(self): if self.tree_mode == "greedy": - mcts = Greedy(root_node=None, max_depth=5) + mcts = Greedy(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights) elif self.tree_mode == "random": - mcts = Random(root_node=None, max_depth=5) + mcts = Random(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights) else: - mcts = MCTS(root_node=None, max_depth=5) + mcts = MCTS(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights) best_nodes = await mcts.search( - self.args.task, - self.data_config, - low_is_better=self.args.low_is_better, - load_tree=self.args.load_tree, + state=self.state, reflection=self.args.reflection, rollouts=self.args.rollouts, - name=self.args.name, + load_tree=self.args.load_tree, ) best_node = best_nodes["global_best"] dev_best_node = best_nodes["dev_best"] diff --git a/expo/insights/fixed_insights.json b/expo/insights/fixed_insights.json new file mode 100644 index 000000000..4f42b9db1 --- /dev/null +++ b/expo/insights/fixed_insights.json @@ -0,0 +1,22 @@ +[ +{ + "Analysis": "Use early stopping, hyperparameter tuning, and cross-validation to avoid overfitting and improve robustness of the model.", + "Category": "Model Training", + "task_id": 4 +}, +{ + "Analysis": "use k-fold bagging and early 
stopping", + "Category": "Model Training", + "task_id": 4 +}, +{ + "Analysis": "To avoid overfitting, train a weighted ensemble model such as StackingClassifier or StackingRegressor; You could do some quick model prototyping to see which models work best and then use them in the ensemble.", + "Category": "Model Training", + "task_id": 4 +}, +{ + "Analysis": "Please use autogluon for model training with presets='medium_quality', time_limit=None, give dev dataset to tuning_data, and use right eval_metric.", + "Category": "Model Training", + "task_id": 4 +} +] \ No newline at end of file diff --git a/expo/insights/instruction_generator.py b/expo/insights/instruction_generator.py index c9ff7ec6e..07e5fb655 100644 --- a/expo/insights/instruction_generator.py +++ b/expo/insights/instruction_generator.py @@ -1,4 +1,5 @@ import json +import os import random from expo.utils import clean_json_from_rsp, load_data_config, mcts_logger @@ -68,8 +69,12 @@ class InstructionGenerator: return new_data @staticmethod - def load_analysis_pool(file_path, task_id=None): + def load_analysis_pool(file_path, use_fixed_insights, task_id=None): data = InstructionGenerator.load_json_data(file_path) + if use_fixed_insights: + current_directory = os.path.dirname(__file__) + fixed_insights = InstructionGenerator.load_json_data(f"{current_directory}/fixed_insights.json") + data.extend(fixed_insights) for item in data: if "task_id" not in item: raise ValueError("task_id is not found in the analysis pool") @@ -79,8 +84,12 @@ class InstructionGenerator: return data @staticmethod - async def generate_new_instructions(task_id, original_instruction, max_num, file_path): - data = InstructionGenerator.load_analysis_pool(file_path, task_id) + async def generate_new_instructions( + task_id, original_instruction, max_num, file_path, ext_info=None, use_fixed_insights=False + ): + data = InstructionGenerator.load_analysis_pool( + file_path, task_id=task_id, use_fixed_insights=use_fixed_insights + ) 
new_instructions = [] if len(data) == 0: mcts_logger.log("MCTS", f"No insights available for task {task_id}") @@ -91,12 +100,14 @@ class InstructionGenerator: else: item = data[i] insights = item["Analysis"] - new_instruction = await InstructionGenerator.generate_new_instruction(original_instruction, insights) + new_instruction = await InstructionGenerator.generate_new_instruction( + original_instruction, insights, ext_info + ) new_instructions.append(new_instruction) return new_instructions @staticmethod - async def generate_new_instruction(original_instruction, insights): + async def generate_new_instruction(original_instruction, insights, ext_info): prompt = CHANGE_INSTRUCTION.format(instruction=original_instruction, insights=insights) llm = LLM() context = llm.format_msg([Message(content=prompt, role="user")]) diff --git a/expo/requirements.txt b/expo/requirements.txt index 04de1a8bb..e85818bbe 100644 --- a/expo/requirements.txt +++ b/expo/requirements.txt @@ -3,3 +3,4 @@ openml==0.14.2 # ml module to run in DI xgboost catboost +lightgbm diff --git a/expo/run_experiment.py b/expo/run_experiment.py index 2123fade3..f1b5b2d80 100644 --- a/expo/run_experiment.py +++ b/expo/run_experiment.py @@ -28,11 +28,11 @@ def get_mcts_args(parser): parser.add_argument("--no_load_tree", dest="load_tree", action="store_false") parser.set_defaults(load_tree=False) parser.add_argument("--rollouts", type=int, default=5) + parser.add_argument("--use_fixed_insights", dest="use_fixed_insights", action="store_true") def get_aug_exp_args(parser): parser.add_argument("--aug_mode", type=str, default="single", choices=["single", "set"]) - parser.add_argument("--num_experiments", type=int, default=1) def get_di_args(parser): @@ -41,6 +41,8 @@ def get_di_args(parser): parser.set_defaults(low_is_better=False) parser.add_argument("--reflection", dest="reflection", action="store_true") parser.add_argument("--no_reflection", dest="reflection", action="store_false") + 
parser.add_argument("--num_experiments", type=int, default=1) + parser.add_argument("--special_instruction", type=str, default=None, choices=["ag", "stacking"]) parser.set_defaults(reflection=True) diff --git a/metagpt/prompts/task_type.py b/metagpt/prompts/task_type.py index 116756edc..599d437c5 100644 --- a/metagpt/prompts/task_type.py +++ b/metagpt/prompts/task_type.py @@ -11,7 +11,7 @@ The current task is about data preprocessing, please note the following: - Monitor data types per column, applying appropriate methods. - Ensure operations are on existing dataset columns. - Avoid writing processed data to files. -- Avoid any change to label column, such as standardization, etc. +- **ATTENTION** Do NOT make any changes to the label column, such as standardization, etc. - Prefer alternatives to one-hot encoding for categorical data. - Only encode or scale necessary columns to allow for potential feature-specific engineering tasks (like time_extract, binning, extraction, etc.) later. - Each step do data preprocessing to train, must do same for test separately at the same time. @@ -26,7 +26,7 @@ The current task is about feature engineering. when performing it, please adhere - Avoid creating redundant or excessively numerous features in one step. - Exclude ID columns from feature generation and remove them. - Each feature engineering operation performed on the train set must also applies to the dev/test separately at the same time. -- Avoid using the label column to create features, except for cat encoding. +- **ATTENTION** Do NOT use the label column to create features, except for cat encoding. - Use the data from previous task result if exist, do not mock or reload data yourself. - Always copy the DataFrame before processing it and use the copy to process. """ @@ -34,6 +34,9 @@ The current task is about feature engineering. 
when performing it, please adhere # Prompt for taking on "model_train" tasks MODEL_TRAIN_PROMPT = """ The current task is about training a model, please ensure high performance: +- For tabular datasets - you have access to LightGBM, CatBoost, XGBoost, random forest, extremely randomized trees, k-nearest neighbors, linear regression, etc. +- For image datasets - you have access to ResNet, VGG, Inception, MobileNet, DenseNet, EfficientNet, etc. +- For text datasets - you have access to BERT, GPT-2, RoBERTa, DistilBERT, T5, etc. - Keep in mind that your user prioritizes results and is highly focused on model performance. So, when needed, feel free to use models of any complexity to improve effectiveness, such as XGBoost, CatBoost, etc. - If non-numeric columns exist, perform label encode together with all steps. - Use the data from previous task result directly, do not mock or reload data yourself.