diff --git a/expo/MCTS.py b/expo/MCTS.py
index 7c03e2e86..b2ad824e5 100644
--- a/expo/MCTS.py
+++ b/expo/MCTS.py
@@ -3,6 +3,7 @@ import os
 import pickle
 import random
 
+import numpy as np
 import pandas as pd
 
 from expo.dataset import generate_task_requirement, get_split_dataset_path
@@ -209,16 +210,20 @@ class Node:
                     await role.run(with_message="continue")
                 else:
                     await role.run(with_message=self.state["requirement"])
+                score_dict = await role.get_score()
+                score_dict = self.evaluate_simulation(score_dict)
+                self.raw_reward = score_dict
                 run_finished = True
             except Exception as e:
                 mcts_logger.log("MCTS", f"Error in running the role: {e}")
                 num_runs += 1
         if not run_finished:
             mcts_logger.log("MCTS", f"Role {role.node_id} failed to run")
-            return {"test_score": 0, "dev_score": 0, "score": 0}
-        score_dict = await role.get_score()
-        score_dict = self.evaluate_simulation(score_dict)
-        self.raw_reward = score_dict
+            if self.state["low_is_better"]:
+                score_dict = {"test_score": np.inf, "dev_score": np.inf, "score": np.inf}
+            else:
+                score_dict = {"test_score": 0, "dev_score": 0, "score": 0}
+            self.raw_reward = score_dict
         if self.state["low_is_better"]:
             # normalize the score to be between 0 and 1, and higher is better
             def normalize_score(score):
diff --git a/expo/README.md b/expo/README.md
index 4cc4daf25..eab3298dc 100644
--- a/expo/README.md
+++ b/expo/README.md
@@ -33,15 +33,17 @@ ### Budget
 Number of experiment rounds k = 10, 20
 
-### 提示词使用
+### Prompt Usage
 - Obtain the prompt by running the `generate_task_requirement` function in `dataset.py`
-- Each dataset has a `dataset_info.json`; its contents need to be provided to the baselines to ensure fairness
+    - Non-DI-based methods should set `is_di=False`
+    - Use `utils.DATA_CONFIG` for `data_config`
+- Each dataset has a `dataset_info.json`; its contents need to be provided to the baselines to ensure fairness (`generate_task_requirement` already includes them by default)
 
 ## 3. Evaluation
 
-Run each framework; after running, it needs to provide `dev_predictions.csv` and `test_predictions.csv` for Dev and Test, with the column name target
+Run each framework; after running, it needs to provide `dev_predictions.csv` and `test_predictions.csv` for Dev and Test, where each csv file only needs a single column named target
 
 - Use `CustomExperimenter`
 ```
@@ -62,11 +64,6 @@ #### Setup
 ```
 pip install -U pip
 pip install -U setuptools wheel
-
-CPU version of pytorch has smaller footprint - see installation instructions in
-pytorch documentation - https://pytorch.org/get-started/locally/
-pip install torch==2.3.1 torchvision==0.18.1 --index-url https://download.pytorch.org/whl/cpu
-
 pip install autogluon
 ```
@@ -105,11 +102,11 @@ #### Setup
 #### Run
 
-- `python run_experiment.py --exp_mode mcts --task titanic --rollout 5`
+- `python run_experiment.py --exp_mode mcts --task titanic --rollout 10`
 
 If the dataset has a regression metric, remember to use `--low_is_better`:
 
-- `python run_experiment.py --exp_mode mcts --task house_prices --rollout 5 --low_is_better`
+- `python run_experiment.py --exp_mode mcts --task house_prices --rollout 10 --low_is_better`
diff --git a/expo/dataset.py b/expo/dataset.py
index fee1199a9..f7e0301b5 100644
--- a/expo/dataset.py
+++ b/expo/dataset.py
@@ -16,9 +16,8 @@ Perform data analysis, data preprocessing, feature engineering, and modeling to
 Report {metric} on the eval data. Do not plot or make any visualizations.
 """
 
-TASK_PROMPT = """\
-# User requirement
-{user_requirement}
+
+DI_INSTRUCTION = """\
 **Attention**
 1. Please do not leak the target label in any form during training.
 2. Dev and Test sets do not have the target column.
@@ -39,14 +38,19 @@ Print the training set performance in the last step.
 Write in this format: print("Train score:", train_score)
 ```
+# Output dir
+{output_dir}
+"""
+
+TASK_PROMPT = """\
+# User requirement
+{user_requirement}
+{additional_instruction}
 # Data dir
 training (with labels): {train_path}
 dev (without labels): {dev_path}
 testing (without labels): {test_path}
-
-# Output dir
-{output_dir}
-
+dataset description: {data_info_path} (You can use this file to get additional information about the dataset)
 """
@@ -132,7 +136,12 @@ def create_dataset_dict(dataset):
     return dataset_dict
 
 
-def generate_task_requirement(task_name, data_config):
+def generate_di_instruction(output_dir):
+    additional_instruction = DI_INSTRUCTION.format(output_dir=output_dir)
+    return additional_instruction
+
+
+def generate_task_requirement(task_name, data_config, is_di=True):
     user_requirement = get_user_requirement(task_name, data_config)
     split_dataset_path = get_split_dataset_path(task_name, data_config)
     train_path = split_dataset_path["train"]
@@ -140,12 +149,19 @@
     test_path = split_dataset_path["test_wo_target"]
     work_dir = data_config["work_dir"]
     output_dir = f"{work_dir}/{task_name}"
+    datasets_dir = data_config["datasets_dir"]
+    data_info_path = f"{datasets_dir}/{task_name}/dataset_info.json"
+    if is_di:
+        additional_instruction = generate_di_instruction(output_dir)
+    else:
+        additional_instruction = ""
     user_requirement = TASK_PROMPT.format(
         user_requirement=user_requirement,
         train_path=train_path,
         dev_path=dev_path,
         test_path=test_path,
-        output_dir=output_dir,
+        additional_instruction=additional_instruction,
+        data_info_path=data_info_path,
     )
     print(user_requirement)
     return user_requirement
diff --git a/expo/experimenter/experimenter.py b/expo/experimenter/experimenter.py
index 83dde80b9..4161aef3d 100644
--- a/expo/experimenter/experimenter.py
+++ b/expo/experimenter/experimenter.py
@@ -54,11 +54,15 @@ class Experimenter:
                 {"idx": i, "score_dict": score_dict, "user_requirement": user_requirement, "args": vars(self.args)}
             )
             self.save_result(results)  # save intermediate results
-        dev_scores = [result["score_dict"]["dev_score"] for result in results]
+        dev_scores = [
+            result["score_dict"]["dev_score"] for result in results if result["score_dict"]["dev_score"] != -1
+        ]
         best_dev_score = max(dev_scores) if not self.args.low_is_better else min(dev_scores)
         best_score_idx = dev_scores.index(best_dev_score)
 
-        test_scores = [result["score_dict"]["test_score"] for result in results]
+        test_scores = [
+            result["score_dict"]["test_score"] for result in results if result["score_dict"]["dev_score"] != -1
+        ]
         avg_score = sum(test_scores) / len(test_scores)
         global_best_score = max(test_scores) if not self.args.low_is_better else min(test_scores)
diff --git a/expo/run_exp_augmentation.py b/expo/run_exp_augmentation.py
deleted file mode 100644
index 7fb174ff7..000000000
--- a/expo/run_exp_augmentation.py
+++ /dev/null
@@ -1,103 +0,0 @@
-import argparse
-import asyncio
-import datetime
-import json
-import os
-
-import pandas as pd
-
-from expo.dataset import generate_task_requirement
-from expo.evaluation.evaluation import evaluate_score
-from expo.insights.instruction_generator import InstructionGenerator
-from expo.MCTS import create_initial_state
-from expo.research_assistant import ResearchAssistant
-from expo.utils import DATA_CONFIG, get_exp_pool_path
-
-EXPS_PROMPT = """
-When doing the tasks, you can refer to the insights below:
-{experience}
-
-"""
-data_config = DATA_CONFIG
-
-
-def evaluate_test(score, state):
-    datetime_text = datetime.datetime.now().strftime("%Y%m%d%H%M")
-    task_name = state["task"]
-    prediction_fpath = os.path.join(state["work_dir"], task_name, "predictions.csv")
-    predictions = pd.read_csv(prediction_fpath)["target"]
-    # copy predictions.csv to the node_dir
-
-    predictions_node_fpath = os.path.join("results", f"{task_name}-{datetime_text}-predictions.csv")
-    predictions.to_csv(predictions_node_fpath, index=False)
-    # load test_target.csv
-    split_datasets_dir = state["datasets_dir"]
-    gt = pd.read_csv(os.path.join(split_datasets_dir["test_target"]))["target"]
-    metric = state["dataset_config"]["metric"]
-    score["test_score"] = evaluate_score(predictions, gt, metric)
-    return score
-
-
-async def main(task_name, use_reflection=True, mode="single", num_experiments=2):
-    """
-    mode: single or set
-    single: sample one instruction
-    set: sample a set of instructions
-    """
-    low_is_better = False
-    state = create_initial_state(
-        task_name, start_task_id=1, data_config=data_config, low_is_better=low_is_better, name=""
-    )
-
-    user_requirement = generate_task_requirement(task_name, data_config)
-    exp_pool_path = get_exp_pool_path(task_name, data_config, pool_name="ds_analysis_pool")
-    exp_pool = InstructionGenerator.load_analysis_pool(exp_pool_path)
-    if mode == "single":
-        exps = InstructionGenerator._random_sample(exp_pool, num_experiments)
-        exps = [exp["Analysis"] for exp in exps]
-    elif mode == "set":
-        exp_set = InstructionGenerator.sample_instruction_set(exp_pool)
-        exp_set_text = "\n".join([f"{exp['task_id']}: {exp['Analysis']}" for exp in exp_set])
-        exps = [exp_set_text] * num_experiments
-    else:
-        raise ValueError(f"Invalid mode: {mode}")
-
-    scores = []
-    for i in range(num_experiments):
-        di = ResearchAssistant(node_id=str(i), use_reflection=use_reflection)
-        di.role_dir = f"{di.role_dir}_{task_name}"
-        requirement = user_requirement + EXPS_PROMPT.format(experience=exps[i])
-        print(requirement)
-        await di.run(requirement)
-        score = await di.get_score(low_is_better=False)
-        score = evaluate_test(score, state)
-
-        scores.append(score)
-
-    with open(f"results/{task_name}_scores.json", "w") as f:
-        # save scores and corresponding insights
-        results = {
-            "avg_score": sum([score["test_score"] for score in scores if score]) / num_experiments,
-            "max_score": max([score["test_score"] for score in scores]),
-            "scores": scores,
-            "insights": exps,
-        }
-        json.dump(results, f, indent=4)
-
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--task", type=str, default="titanic")
-    parser.add_argument("--use_reflection", dest="use_reflection", action="store_true")
-    parser.add_argument("--no_use_reflection", dest="use_reflection", action="store_false")
-    parser.set_defaults(use_reflection=True)
-    parser.add_argument("--mode", type=str, default="single")
-    parser.add_argument("--num_experiments", type=int, default=2)
-    return parser.parse_args()
-
-
-if __name__ == "__main__":
-    args = parse_args()
-    asyncio.run(
-        main(args.task, use_reflection=args.use_reflection, mode=args.mode, num_experiments=args.num_experiments)
-    )
diff --git a/expo/run_mcts.py b/expo/run_mcts.py
deleted file mode 100644
index 4577417a9..000000000
--- a/expo/run_mcts.py
+++ /dev/null
@@ -1,58 +0,0 @@
-import argparse
-import asyncio
-
-from expo.evaluation.visualize_mcts import get_tree_text
-from expo.MCTS import MCTS
-from expo.utils import load_data_config
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--task", type=str, default="titanic")
parser.add_argument("--low_is_better", dest="low_is_better", action="store_true") - parser.set_defaults(low_is_better=False) - parser.add_argument("--load_tree", dest="load_tree", action="store_true") - parser.add_argument("--no_load_tree", dest="load_tree", action="store_false") - parser.set_defaults(load_tree=True) - parser.add_argument("--reflection", dest="reflection", action="store_true") - parser.add_argument("--no_reflection", dest="reflection", action="store_false") - parser.set_defaults(reflection=True) - parser.add_argument("--rollouts", type=int, default=3) - parser.add_argument("--name", type=str, default="") - return parser.parse_args() - - -data_config = load_data_config() - -if __name__ == "__main__": - args = get_args() - # requirement = generate_task_requirement(args.task, data_config) - # print(requirement) - - # role, root_node = initialize_di_root_node(requirement, data_config) - # asyncio.run(role.run(requirement)) - - # asyncio.run(root_node.run_node()) - mcts = MCTS(root_node=None, max_depth=5) - best_nodes = asyncio.run( - mcts.search( - args.task, - data_config, - low_is_better=args.low_is_better, - load_tree=args.load_tree, - reflection=args.reflection, - rollouts=args.rollouts, - name=args.name, - ) - ) - best_node = best_nodes["global_best"] - dev_best_node = best_nodes["dev_best"] - text, num_generated_codes = get_tree_text(mcts.root_node) - print(text) - print(f"Generated {num_generated_codes} unique codes.") - - with open(f"results/{args.task}_tree{args.name}.txt", "w") as f: - f.write(f"Generated {num_generated_codes} unique codes.\n") - f.write(f"Best node: {best_node}, score: {best_node.raw_reward}\n") - f.write(f"Dev best node: {dev_best_node}, score: {dev_best_node.raw_reward}\n") - f.write(text)