rename expo folder to sela

2026-07-14 16:32:16 +02:00 · 2024-10-22 21:33:31 +08:00 · 2024-10-22 21:33:31 +08:00 · 7c5b29de63
commit 7c5b29de63
parent 4bed19b931
33 changed files with 53 additions and 53 deletions
--- a/sela/experimenter/__init__.py
+++ b/sela/experimenter/__init__.py
--- a/sela/experimenter/aide.py
+++ b/sela/experimenter/aide.py
@ -0,0 +1,31 @@
+import aide
+import os
+import time
+
+# Example driver: runs an AIDE experiment on the Titanic data directory and
+# prints the best solution together with the elapsed wall-clock time.
+#
+# The two values below are placeholders: replace them here or export the
+# variables before running. setdefault keeps credentials that are already
+# present in the environment instead of overwriting them with placeholders.
+os.environ.setdefault("OPENAI_API_KEY", "sk-xxx")
+os.environ.setdefault("OPENAI_BASE_URL", "your url")
+start_time = time.time()
+data_dir = "xxx/data/titanic"
+goal = f"""
+# User requirement
+({data_dir}, 'This is a 04_titanic dataset. Your goal is to predict the target column `Survived`.\nPerform data analysis, data preprocessing, feature engineering, and modeling to predict the target. \nReport f1 on the eval data. Do not plot or make any visualizations.\n')
+
+# Data dir
+training (with labels): train.csv
+testing (without labels): test.csv
+dataset description: dataset_info.json (You can use this file to get additional information about the dataset)"""
+
+exp = aide.Experiment(
+    data_dir=data_dir,  # replace this with your own directory
+    goal=goal,
+    eval="f1",  # replace with your own evaluation metric
+)
+
+best_solution = exp.run(steps=10)
+
+print(f"Best solution has validation metric: {best_solution.valid_metric}")
+print(f"Best solution code: {best_solution.code}")
+end_time = time.time()
+execution_time = end_time - start_time
+
+print(f"run time : {execution_time} seconds")
--- a/sela/experimenter/aug.py
+++ b/sela/experimenter/aug.py
@ -0,0 +1,55 @@
+from sela.experimenter.experimenter import Experimenter
+from sela.insights.instruction_generator import InstructionGenerator
+from sela.research_assistant import ResearchAssistant
+from sela.utils import get_exp_pool_path
+
+# Prompt suffix appended to the user requirement in AugExperimenter.run_experiment;
+# the `{experience}` placeholder is filled with the sampled insight text via str.format.
+EXPS_PROMPT = """
+When doing the tasks, you can refer to the insights below:
+{experience}
+
+"""
+
+
+class AugExperimenter(Experimenter):
+    """Experimenter that appends sampled insights to the requirement of every run."""
+
+    result_path: str = "results/aug"
+
+    async def run_experiment(self):
+        """Sample insights according to ``aug_mode``, run one assistant per experiment, save the summary.
+
+        Raises:
+            ValueError: if ``args.aug_mode`` is neither ``"single"`` nor ``"set"``.
+        """
+        args = self.args
+        base_requirement = self.state["requirement"]
+        pool_path = get_exp_pool_path(args.task, self.data_config, pool_name="ds_analysis_pool")
+        insight_pool = InstructionGenerator.load_analysis_pool(
+            pool_path, use_fixed_insights=args.use_fixed_insights
+        )
+
+        mode = args.aug_mode
+        if mode == "single":
+            # One insight per experiment.
+            sampled = InstructionGenerator._random_sample(insight_pool, args.num_experiments)
+            exps = [item["Analysis"] for item in sampled]
+        elif mode == "set":
+            # One freshly sampled instruction set per experiment, rendered as "task_id: analysis" lines.
+            exps = []
+            for _ in range(args.num_experiments):
+                members = InstructionGenerator.sample_instruction_set(insight_pool)
+                lines = [f"{member['task_id']}: {member['Analysis']}" for member in members]
+                exps.append("\n".join(lines))
+        else:
+            raise ValueError(f"Invalid mode: {mode}")
+
+        results = []
+        for run_idx in range(args.num_experiments):
+            di = ResearchAssistant(
+                node_id=str(run_idx), use_reflection=args.reflection, role_timeout=args.role_timeout
+            )
+            # Give each task its own role directory.
+            di.role_dir = f"{di.role_dir}_{args.task}"
+            requirement = base_requirement + EXPS_PROMPT.format(experience=exps[run_idx])
+            print(requirement)
+            score_dict = await self.run_di(di, requirement, run_idx=run_idx)
+            record = {
+                "idx": run_idx,
+                "score_dict": score_dict,
+                "aug_mode": args.aug_mode,
+                "insights": exps[run_idx],
+                "user_requirement": requirement,
+                "args": vars(args),
+            }
+            results.append(record)
+
+        summary = self.summarize_results(results)
+        self.save_result(summary)
--- a/sela/experimenter/autogluon.py
+++ b/sela/experimenter/autogluon.py
@ -0,0 +1,126 @@
+from datetime import datetime
+from sela.experimenter.custom import CustomExperimenter
+import os
+import pandas as pd
+
+
+class AGRunner:
+    """Trains AutoGluon predictors on the task's dataset splits and predicts dev/test.
+
+    ``state`` is read for ``datasets_dir`` (paths keyed ``train``, ``dev``, ``dev_wo_target``,
+    ``test_wo_target``), ``dataset_config`` (``target_col``, ``metric``) and ``task``.
+    Despite the ``None`` default, a state mapping is required: ``__init__`` indexes it.
+    """
+
+    # Root folder that the image paths of multimodal datasets are joined onto.
+    # The default keeps the previously hard-coded location; override it on a
+    # subclass or instance to run on another machine.
+    image_root = "F:/Download/Dataset/"
+
+    def __init__(self, state=None):
+        self.state = state
+        self.datasets = self.state["datasets_dir"]
+
+    def _eval_metric(self):
+        """Return the configured metric name with spaces replaced by underscores."""
+        return self.state["dataset_config"]["metric"].replace(" ", "_")
+
+    def _model_path(self):
+        """Return the output directory for a predictor: task name plus a minute-resolution timestamp."""
+        timestamp = datetime.now().strftime("%y%m%d_%H%M")
+        return "AutogluonModels/ag-{}-{}".format(self.state["task"], timestamp)
+
+    def run(self):
+        """Fit a ``TabularPredictor`` and return ``{"test_preds": ..., "dev_preds": ...}``."""
+        from autogluon.tabular import TabularDataset, TabularPredictor
+
+        target_col = self.state["dataset_config"]["target_col"]
+        train_data = TabularDataset(self.datasets["train"])
+        dev_data = TabularDataset(self.datasets["dev"])
+        dev_wo_target_data = TabularDataset(self.datasets["dev_wo_target"])
+        test_data = TabularDataset(self.datasets["test_wo_target"])
+
+        predictor = TabularPredictor(
+            label=target_col,
+            eval_metric=self._eval_metric(),
+            path=self._model_path(),
+        ).fit(train_data=train_data, tuning_data=dev_data, num_gpus=1)
+
+        dev_preds = predictor.predict(dev_wo_target_data)
+        test_preds = predictor.predict(test_data)
+        return {"test_preds": test_preds, "dev_preds": dev_preds}
+
+    def run_multimodal(self):
+        """Fit a ``MultiModalPredictor`` and return ``{"dev_preds": ..., "test_preds": ...}``."""
+        from autogluon.multimodal import MultiModalPredictor
+
+        target_col = self.state["dataset_config"]["target_col"]
+        train_data, dev_data, dev_wo_target_data, test_data = self.load_split_dataset(
+            self.datasets["train"],
+            self.datasets["dev"],
+            self.datasets["dev_wo_target"],
+            self.datasets["test_wo_target"],
+        )
+
+        predictor = MultiModalPredictor(
+            label=target_col,
+            eval_metric=self._eval_metric(),
+            path=self._model_path(),
+        ).fit(train_data=train_data, tuning_data=dev_data)
+
+        dev_preds = predictor.predict(dev_wo_target_data)
+        test_preds = predictor.predict(test_data)
+        return {"dev_preds": dev_preds, "test_preds": test_preds}
+
+    def load_split_dataset(self, train_path, dev_path, dev_wo_target_path, test_wo_target_path):
+        """
+        Loads training, dev, and test datasets from given file paths and prefixes their image paths.
+
+        The first column of the training data is taken to be the image-path column; in every
+        split that column is joined onto ``<image_root>/<task>``.
+
+        Args:
+            train_path (str): Path to the training dataset.
+            dev_path (str): Path to the dev dataset with target labels.
+            dev_wo_target_path (str): Path to the dev dataset without target labels.
+            test_wo_target_path (str): Path to the test dataset without target labels.
+
+        Returns:
+            train_data (pd.DataFrame): Loaded training dataset with updated image paths.
+            dev_data (pd.DataFrame): Loaded dev dataset with updated image paths.
+            dev_wo_target_data (pd.DataFrame): Loaded dev dataset without target labels and updated image paths.
+            test_data (pd.DataFrame): Loaded test dataset with updated image paths.
+        """
+        root_folder = os.path.join(self.image_root, self.state["task"])
+
+        split_paths = (train_path, dev_path, dev_wo_target_path, test_wo_target_path)
+        frames = [pd.read_csv(path) for path in split_paths]
+
+        # Name of the first training column (assumed to be the image path column).
+        image_column = frames[0].columns[0]
+        for frame in frames:
+            frame[image_column] = frame[image_column].apply(lambda rel: os.path.join(root_folder, rel))
+
+        train_data, dev_data, dev_wo_target_data, test_data = frames
+        return train_data, dev_data, dev_wo_target_data, test_data
+
+
+class GluonExperimenter(CustomExperimenter):
+    """Experimenter that scores AutoGluon predictions on the dev and test splits."""
+
+    result_path: str = "results/autogluon"
+
+    def __init__(self, args, **kwargs):
+        super().__init__(args, **kwargs)
+        self.framework = AGRunner(self.state)
+        # Multimodal mode is opt-in: args without the flag fall back to the tabular runner.
+        self.is_multimodal = getattr(args, "is_multimodal", False)
+
+    async def run_experiment(self):
+        """Run the tabular or multimodal AutoGluon pipeline, score both splits and save the result."""
+        if self.is_multimodal:
+            result = self.framework.run_multimodal()
+        else:
+            result = self.framework.run()
+
+        assert result is not None
+        requirement = self.state["requirement"]
+        dev_preds, test_preds = result["dev_preds"], result["test_preds"]
+
+        score_dict = {}
+        score_dict["dev_score"] = self.evaluate_predictions(dev_preds, "dev")
+        score_dict["test_score"] = self.evaluate_predictions(test_preds, "test")
+
+        record = {"score_dict": score_dict, "user_requirement": requirement, "args": vars(self.args)}
+        self.save_result([0, record])
--- a/sela/experimenter/autosklearn.py
+++ b/sela/experimenter/autosklearn.py
@ -0,0 +1,96 @@
+from datetime import datetime
+import pandas as pd
+from sela.experimenter.custom import CustomExperimenter
+from sela.evaluation.evaluation import evaluate_score
+from functools import partial
+
+
+def custom_scorer(y_true, y_pred, metric_name):
+    """Score predictions with ``evaluate_score`` from a ``(y_true, y_pred)`` call.
+
+    ``evaluate_score`` is invoked with the predictions first and the ground truth second.
+    """
+    score = evaluate_score(y_pred, y_true, metric_name)
+    return score
+
+
+class ASRunner:
+    """Runs auto-sklearn on the task's train split and predicts the dev and test splits.
+
+    ``state`` is read for ``datasets_dir`` (paths keyed ``train``, ``dev_wo_target``,
+    ``test_wo_target``), ``dataset_config`` (``target_col``, ``metric``) and ``task``.
+    Despite the ``None`` default, a state mapping is required: ``__init__`` indexes it.
+    """
+
+    # Search budget in seconds, passed to auto-sklearn as time_left_for_this_task.
+    time_limit = 600
+
+    def __init__(self, state=None):
+        self.state = state
+        self.datasets = self.state["datasets_dir"]
+
+    def create_autosklearn_scorer(self, metric_name):
+        """Wrap ``custom_scorer`` for ``metric_name`` as an auto-sklearn scorer.
+
+        For ``"rmse"`` the scorer is declared as an error to minimise; every other
+        metric keeps auto-sklearn's default of greater-is-better.
+        """
+        from autosklearn.metrics import make_scorer
+
+        score_func = partial(custom_scorer, metric_name=metric_name)
+        if metric_name == "rmse":
+            # make_scorer defaults to greater_is_better=True, which would make the
+            # search maximise the error.
+            # NOTE(review): assumes evaluate_score returns the raw (non-negated) RMSE - confirm.
+            return make_scorer(
+                name=metric_name,
+                score_func=score_func,
+                optimum=0,
+                worst_possible_result=2**31 - 1,
+                greater_is_better=False,
+            )
+        return make_scorer(name=metric_name, score_func=score_func)
+
+    def run(self):
+        """Fit auto-sklearn on the train split and return ``{"test_preds": ..., "dev_preds": ...}``.
+
+        Raises:
+            ValueError: if the configured metric is not ``"rmse"``, ``"f1"`` or ``"f1 weighted"``.
+        """
+        import autosklearn.classification
+        import autosklearn.regression
+
+        # Pick the estimator first so an unsupported metric fails before any data is read.
+        eval_metric = self.state["dataset_config"]["metric"]
+        if eval_metric == "rmse":
+            estimator_cls = autosklearn.regression.AutoSklearnRegressor
+        elif eval_metric in ["f1", "f1 weighted"]:
+            estimator_cls = autosklearn.classification.AutoSklearnClassifier
+        else:
+            raise ValueError(f"Unsupported metric: {eval_metric}")
+
+        target_col = self.state["dataset_config"]["target_col"]
+        train_data = pd.read_csv(self.datasets["train"])
+        dev_data = pd.read_csv(self.datasets["dev_wo_target"])
+        test_data = pd.read_csv(self.datasets["test_wo_target"])
+        X_train = train_data.drop(columns=[target_col])
+        y_train = train_data[target_col]
+
+        automl = estimator_cls(
+            time_left_for_this_task=self.time_limit,
+            metric=self.create_autosklearn_scorer(eval_metric),
+            memory_limit=8192,
+            tmp_folder="AutosklearnModels/as-{}-{}".format(
+                self.state["task"], datetime.now().strftime("%y%m%d_%H%M")
+            ),
+            n_jobs=-1,
+        )
+        automl.fit(X_train, y_train)
+
+        dev_preds = automl.predict(dev_data)
+        test_preds = automl.predict(test_data)
+
+        return {"test_preds": test_preds, "dev_preds": dev_preds}
+
+
+class AutoSklearnExperimenter(CustomExperimenter):
+    """Experimenter that scores auto-sklearn predictions on the dev and test splits."""
+
+    result_path: str = "results/autosklearn"
+
+    def __init__(self, args, **kwargs):
+        super().__init__(args, **kwargs)
+        self.framework = ASRunner(self.state)
+
+    async def run_experiment(self):
+        """Run auto-sklearn, score both splits and save the result."""
+        result = self.framework.run()
+        requirement = self.state["requirement"]
+        dev_preds, test_preds = result["dev_preds"], result["test_preds"]
+
+        score_dict = {}
+        score_dict["dev_score"] = self.evaluate_predictions(dev_preds, "dev")
+        score_dict["test_score"] = self.evaluate_predictions(test_preds, "test")
+
+        record = {
+            "score_dict": score_dict,
+            "user_requirement": requirement,
+            "args": vars(self.args),
+        }
+        self.save_result([0, record])
--- a/sela/experimenter/custom.py
+++ b/sela/experimenter/custom.py
@ -0,0 +1,62 @@
+import os
+
+import pandas as pd
+
+from sela.evaluation.evaluation import evaluate_score
+from sela.experimenter.experimenter import Experimenter
+from sela.MCTS import create_initial_state
+
+
+class CustomExperimenter(Experimenter):
+    """Experimenter for external frameworks that produce dev/test predictions themselves.
+
+    Keyword arguments read from ``kwargs``: ``framework`` (object exposing ``run``), ``task``,
+    ``low_is_better`` and ``name``; the middle two default to the values on ``args``.
+    """
+
+    result_path: str = "results/custom"
+
+    def __init__(self, args, **kwargs):
+        super().__init__(args, **kwargs)
+        self.framework = kwargs.get("framework", None)  # todo
+        self.task = kwargs.get("task", self.args.task)
+        self.low_is_better = kwargs.get("low_is_better", self.args.low_is_better)
+        self.name = kwargs.get("name", "")
+        # NOTE(review): this instance attribute shadows the class-level result_path,
+        # including the one declared by subclasses - confirm that is intended.
+        self.result_path = f"results/custom_{self.name}"
+        # Rebuild the state for self.task, which may differ from args.task.
+        self.state = create_initial_state(
+            self.task,
+            start_task_id=1,
+            data_config=self.data_config,
+            args=self.args,
+        )
+
+    def run_experiment(self):
+        """Run the framework on the requirement, score its predictions and save the result.
+
+        NOTE(review): the base class declares ``run_experiment`` as a coroutine while this
+        override is synchronous - confirm how callers invoke it.
+        """
+        user_requirement = self.state["requirement"]
+        preds = self.framework.run(user_requirement)
+        test_preds = preds["test_preds"]
+        dev_preds = preds["dev_preds"]
+        score_dict = {
+            "dev_score": self.evaluate_predictions(dev_preds, "dev"),
+            "test_score": self.evaluate_predictions(test_preds, "test"),
+        }
+        # save_result inserts the timing entry with list.insert, so the payload has to be
+        # a list; the [0, {...}] layout matches what the other custom experimenters save.
+        results = [0, {"score_dict": score_dict, "user_requirement": user_requirement, "args": vars(self.args)}]
+        self.save_result(results)
+
+    def evaluate_pred_files(self, dev_pred_path, test_pred_path):
+        """Score prediction CSVs (read from their ``target`` column) for the dev and test splits.
+
+        Returns:
+            dict with ``dev_score`` and ``test_score``.
+        """
+        dev_preds = pd.read_csv(dev_pred_path)["target"]
+        test_preds = pd.read_csv(test_pred_path)["target"]
+        # Previously called self.evaluate_score, which this class does not define.
+        score_dict = {
+            "dev_score": self.evaluate_predictions(dev_preds, "dev"),
+            "test_score": self.evaluate_predictions(test_preds, "test"),
+        }
+        return score_dict
+
+    def evaluate_predictions(self, preds, split):
+        """Score ``preds`` against the ground-truth ``target`` column of ``split`` with the task metric."""
+        metric = self.state["dataset_config"]["metric"]
+        gt_path = self.state["datasets_dir"][f"{split}_target"]
+        gt = pd.read_csv(gt_path)["target"]
+        score = evaluate_score(preds, gt, metric)
+        return score
+
+    def load_datasets(self):
+        """Read the ``train``, ``dev`` and ``test`` CSVs listed in the state and return them as DataFrames."""
+        train_path = self.state["datasets_dir"]["train"]
+        dev_path = self.state["datasets_dir"]["dev"]
+        test_path = self.state["datasets_dir"]["test"]
+        train = pd.read_csv(train_path)
+        dev = pd.read_csv(dev_path)
+        test = pd.read_csv(test_path)
+        return train, dev, test
--- a/sela/experimenter/experimenter.py
+++ b/sela/experimenter/experimenter.py
@ -0,0 +1,135 @@
+import datetime
+import json
+import os
+
+import numpy as np
+import pandas as pd
+
+from sela.evaluation.evaluation import evaluate_score
+from sela.MCTS import create_initial_state
+from sela.research_assistant import ResearchAssistant
+from sela.utils import DATA_CONFIG, save_notebook
+
+
+class Experimenter:
+    """Base experiment driver: runs ``ResearchAssistant`` roles on a task, scores them, saves JSON results.
+
+    A score of ``-1`` marks a run that never finished (see ``run_di``).
+    """
+
+    result_path: str = "results/base"
+    data_config = DATA_CONFIG
+    start_task_id = 1
+
+    def __init__(self, args, **kwargs):
+        """Record the start time and build the initial task state from ``args``."""
+        self.args = args
+        self.start_time_raw = datetime.datetime.now()
+        self.start_time = self.start_time_raw.strftime("%Y%m%d%H%M")
+        self.state = create_initial_state(
+            self.args.task,
+            start_task_id=self.start_task_id,
+            data_config=self.data_config,
+            args=self.args,
+        )
+
+    async def run_di(self, di, user_requirement, run_idx):
+        """Run ``di`` on the requirement (up to three attempts) and return its score dict.
+
+        The notebook is saved whether or not a run succeeded. If every attempt raised,
+        a score dict with all entries set to ``-1`` is returned.
+        """
+        max_retries = 3
+        num_runs = 1
+        run_finished = False
+        while num_runs <= max_retries and not run_finished:
+            try:
+                await di.run(user_requirement)
+                score_dict = await di.get_score()
+                score_dict = self.evaluate(score_dict, self.state)
+                run_finished = True
+            except Exception as e:
+                # Best-effort retry: report the error and try again.
+                print(f"Error: {e}")
+                num_runs += 1
+        save_name = self.get_save_name()
+        save_notebook(role=di, save_dir=self.result_path, name=f"{save_name}_{run_idx}")
+
+        if not run_finished:
+            score_dict = {"train_score": -1, "dev_score": -1, "test_score": -1, "score": -1}
+        return score_dict
+
+    def summarize_results(self, results):
+        """Insert a summary dict at index 0 of ``results`` (in place) and return the list.
+
+        The summary holds the best dev score and its index, the test score of that run,
+        the average test score and the best test score. With ``low_is_better`` failed
+        runs (dev score ``-1``) are excluded from the minima; if every run failed, the
+        ``-1`` sentinel is reported instead of raising.
+        """
+        dev_scores = [result["score_dict"]["dev_score"] for result in results]
+        test_scores = [result["score_dict"]["test_score"] for result in results]
+
+        if self.args.low_is_better:
+            valid_dev = [score for score in dev_scores if score != -1]
+            valid_test = [test for dev, test in zip(dev_scores, test_scores) if dev != -1]
+            # Previously an infinity fallback was looked up with dev_scores.index(),
+            # which raised ValueError whenever every run had failed.
+            best_dev_score = min(valid_dev) if valid_dev else -1
+            global_best_score = min(valid_test) if valid_test else -1
+        else:
+            best_dev_score = max(dev_scores)
+            global_best_score = max(test_scores)
+
+        best_score_idx = dev_scores.index(best_dev_score)
+        avg_score = sum(test_scores) / len(test_scores)
+
+        results.insert(
+            0,
+            {
+                "best_dev_score": best_dev_score,
+                "best_dev_score_idx": best_score_idx,
+                "best_dev_test_score": test_scores[best_score_idx],
+                "avg_test_score": avg_score,
+                "global_best_test_score": global_best_score,
+            },
+        )
+        return results
+
+    async def run_experiment(self):
+        """Run ``args.num_experiments`` assistants on the requirement and save the summarized results."""
+        state = self.state
+        user_requirement = state["requirement"]
+        results = []
+
+        for i in range(self.args.num_experiments):
+            di = ResearchAssistant(
+                node_id="0", use_reflection=self.args.reflection, role_timeout=self.args.role_timeout
+            )
+            score_dict = await self.run_di(di, user_requirement, run_idx=i)
+            results.append(
+                {"idx": i, "score_dict": score_dict, "user_requirement": user_requirement, "args": vars(self.args)}
+            )
+            self.save_result(results)  # save intermediate results
+        results = self.summarize_results(results)
+
+        self.save_result(results)
+
+    def evaluate_prediction(self, split, state):
+        """Score ``<work_dir>/<task>/<split>_predictions.csv`` against the split's ground truth.
+
+        The last column of the prediction file is used as the prediction. A copy is written
+        into the node directory, and the original prediction file is removed.
+        """
+        pred_path = os.path.join(state["work_dir"], state["task"], f"{split}_predictions.csv")
+        os.makedirs(state["node_dir"], exist_ok=True)
+        pred_node_path = os.path.join(state["node_dir"], f"{self.start_time}-{split}_predictions.csv")
+        gt_path = state["datasets_dir"][f"{split}_target"]
+        preds = pd.read_csv(pred_path)
+        preds = preds[preds.columns.tolist()[-1]]
+        preds.to_csv(pred_node_path, index=False)
+        gt = pd.read_csv(gt_path)["target"]
+        metric = state["dataset_config"]["metric"]
+        os.remove(pred_path)
+        return evaluate_score(preds, gt, metric)
+
+    def evaluate(self, score_dict, state):
+        """Update ``score_dict`` in place with freshly computed dev and test scores and return it."""
+        scores = {
+            "dev_score": self.evaluate_prediction("dev", state),
+            "test_score": self.evaluate_prediction("test", state),
+        }
+        score_dict.update(scores)
+        return score_dict
+
+    def get_save_name(self):
+        """Return the file stem used for saved artifacts: ``<exp_mode>-<task>_<start_time>``."""
+        return f"{self.args.exp_mode}-{self.args.task}_{self.start_time}"
+
+    def save_result(self, result):
+        """Write a copy of the ``result`` list, with timing info inserted first, to ``<result_path>/<save_name>.json``."""
+        end_time_raw = datetime.datetime.now()
+        end_time = end_time_raw.strftime("%Y%m%d%H%M")
+        time_info = {
+            "start_time": self.start_time,
+            "end_time": end_time,
+            # timedelta.seconds drops whole days; total_seconds() stays correct for long runs.
+            "duration (seconds)": int((end_time_raw - self.start_time_raw).total_seconds()),
+        }
+        result = result.copy()
+        result.insert(0, time_info)
+        save_name = self.get_save_name()
+        os.makedirs(self.result_path, exist_ok=True)
+        with open(f"{self.result_path}/{save_name}.json", "w") as f:
+            json.dump(result, f, indent=4)
--- a/sela/experimenter/mcts.py
+++ b/sela/experimenter/mcts.py
@ -0,0 +1,81 @@
+import shutil
+
+from sela.evaluation.evaluation import (
+    node_evaluate_score_mlebench,
+    node_evaluate_score_sela,
+)
+from sela.evaluation.visualize_mcts import get_tree_text
+from sela.experimenter.experimenter import Experimenter
+from sela.Greedy import Greedy, Random
+from sela.MCTS import MCTS
+
+
+class MCTSExperimenter(Experimenter):
+    """Experimenter that searches solutions with a tree search (MCTS, greedy or random)."""
+
+    result_path: str = "results/mcts"
+
+    def __init__(self, args, tree_mode=None, **kwargs):
+        """Select the start task and the node evaluation function, then initialise the base state.
+
+        Raises:
+            ValueError: if ``args.eval_func`` is neither ``"sela"`` nor ``"mlebench"``.
+        """
+        if args.special_instruction == "image":
+            self.start_task_id = 1  # start from datapreprocessing if it is image task
+        else:
+            self.start_task_id = args.start_task_id
+
+        if args.eval_func == "sela":
+            self.eval_func = node_evaluate_score_sela
+        elif args.eval_func == "mlebench":
+            self.eval_func = node_evaluate_score_mlebench
+        else:
+            # Reject unknown values here; otherwise the missing attribute only
+            # surfaces in run_experiment after the whole search has finished.
+            raise ValueError(f"Unsupported eval_func: {args.eval_func}")
+
+        super().__init__(args, **kwargs)
+        self.tree_mode = tree_mode
+
+    async def run_experiment(self):
+        """Run the tree search, print a report, and save results, notebooks and the tree text."""
+        use_fixed_insights = self.args.use_fixed_insights
+        depth = self.args.max_depth
+        if self.tree_mode == "greedy":
+            mcts = Greedy(root_node=None, max_depth=depth, use_fixed_insights=use_fixed_insights)
+        elif self.tree_mode == "random":
+            mcts = Random(root_node=None, max_depth=depth, use_fixed_insights=use_fixed_insights)
+        else:
+            mcts = MCTS(root_node=None, max_depth=depth, use_fixed_insights=use_fixed_insights)
+        best_nodes = await mcts.search(state=self.state, args=self.args)
+        best_node = best_nodes["global_best"]
+        dev_best_node = best_nodes["dev_best"]
+        score_dict = best_nodes["scores"]
+        additional_scores = {"grader": self.eval_func(dev_best_node)}
+
+        text, num_generated_codes = get_tree_text(mcts.root_node)
+        text += f"Generated {num_generated_codes} unique codes.\n"
+        text += f"Best node: {best_node.id}, score: {best_node.raw_reward}\n"
+        text += f"Dev best node: {dev_best_node.id}, score: {dev_best_node.raw_reward}\n"
+        text += f"Grader score: {additional_scores['grader']}\n"
+        print(text)
+        results = [
+            {
+                "best_node": best_node.id,
+                "best_node_score": best_node.raw_reward,
+                "dev_best_node": dev_best_node.id,
+                "dev_best_node_score": dev_best_node.raw_reward,
+                "num_generated_codes": num_generated_codes,
+                "user_requirement": best_node.state["requirement"],
+                "tree_text": text,
+                "args": vars(self.args),
+                "scores": score_dict,
+                "additional_scores": additional_scores,
+            }
+        ]
+        # save_result creates result_path, which the copies below write into.
+        self.save_result(results)
+        self.copy_notebook(best_node, "best")
+        self.copy_notebook(dev_best_node, "dev_best")
+        self.save_tree(text)
+
+    def copy_notebook(self, node, name):
+        """Copy the node's ``Node-<id>.ipynb`` to ``<result_path>/<save_name>_<name>.ipynb``."""
+        node_dir = node.get_node_dir()
+        node_nb_dir = f"{node_dir}/Node-{node.id}.ipynb"
+        save_name = self.get_save_name()
+        copy_nb_dir = f"{self.result_path}/{save_name}_{name}.ipynb"
+        shutil.copy(node_nb_dir, copy_nb_dir)
+
+    def save_tree(self, tree_text):
+        """Write ``tree_text`` to ``<result_path>/<save_name>_tree.txt``."""
+        save_name = self.get_save_name()
+        fpath = f"{self.result_path}/{save_name}_tree.txt"
+        # Explicit encoding so the output does not depend on the platform default.
+        with open(fpath, "w", encoding="utf-8") as f:
+            f.write(tree_text)
--- a/sela/experimenter/mle_bench/instructions.py
+++ b/sela/experimenter/mle_bench/instructions.py
@ -0,0 +1,48 @@
+# Prompt fragment describing the local submission validation endpoint.
+# NOTE(review): the constant name misspells "INSTRUCTION"; it is left as is because
+# renaming it would break any importer.
+SUBMISSION_INSTURCTION = """
+- **Submission checker**: We provide a tool for checking the validity of your submission file, in the form of a server endpoint at `http://localhost:5000/validate`. If you post a submission file to the server (e.g. `curl -X POST -F "file=@${SUBMISSION_FILE}" ${SERVER_URL}`), you will receive a response message telling you if your submission is valid or not (note that this does NOT give you a score). For your convenience, we provide a script `/home/validate_submission.sh` which can help you make this submission to the validation server.
+"""
+# Benchmark instructions worded in terms of Kaggle competitions.
+# Contains the placeholders {dataset_dir} and {output_dir}; presumably filled by the
+# caller with str.format - verify against the call site.
+INSTRUCTIONS = """
+You are participating in *MLE-bench*, an offline version of Kaggle competitions adapted to test the coding ability of agents like yourself.
+
+You will be given a machine learning task. You must solve the task by training a model and running the model on the test set to produce a submission file. You may not hand-label the data to produce a submission file.
+
+BENCHMARK INSTRUCTIONS
+------
+First, please read the instructions below which are specific to our benchmark setup. Note, these instructions can be found again in `sela.experimenter.mle_bench.instructions.py`.
+
+- **Competition-specific instructions**: The instructions for the competition are available in `{dataset_dir}/description.md`. Read this file to understand the problem setup for the competition you are currently solving.
+- **Dataset**: The dataset files for the competition are available in `{dataset_dir}`.
+- **Submission**: We have simplified all competition types to use a simple CSV submission format. Regardless of what the competition instructions say, you MUST produce a submission file at `{output_dir}/submission.csv`. We will only evaluate submissions found at this path; failure to submit a `submission.csv` at this path will result in a failed attempt.
+- **Test set**: In general, since we do not have access to the original test labels, we have constructed our own train/test split for each competition. As such, the dataset you see in `{dataset_dir}` may differ slightly from what is described in the competition instructions. Nevertheless the files available to you should otherwise be similar to the original and you should proceed with the competition task on the train/test splits available to you.
+- **Cheating**: You are not allowed to view others' solutions to this competition. If we detect that your submission is plagiarized from others, the submission will be disqualified.
+
+Your submission will be graded based on the metric in the competition-specific instructions. Your goal is to achieve the best score possible on the competition task. 
+
+Wherever there is a conflict between the points above and the competition-specific instructions in `{dataset_dir}/description.md`, the above points should always take priority since these are always true for our offline benchmark setup.
+"""
+
+# Variant of INSTRUCTIONS that speaks of a generic "task" and does not mention Kaggle
+# or competitions. Uses the same {dataset_dir} and {output_dir} placeholders.
+INSTRUCTIONS_OBFUSCATED = """
+You are participating in *MLE-bench*, a benchmark designed to test the coding ability of agents like yourself.
+
+You will be given a machine learning task. You must solve the task by training a model and running the model on the test set to produce a submission file. You may not hand-label the data to produce a submission file.
+
+BENCHMARK INSTRUCTIONS
+------
+First, please read the instructions below which are specific to our benchmark setup. Note, these instructions can be found again in `sela.experimenter.mle_bench.instructions.py`.
+
+- **Task-specific instructions**: The instructions for the task are available in `{dataset_dir}/description.md`. Read this file to understand the problem setup for the task you are currently solving.
+- **Dataset**: The dataset files for the task are available in `{dataset_dir}/`.
+- **Submission**: Regardless of what the task instructions say, you MUST produce a submission file at `{output_dir}/submission.csv`. We will only evaluate submissions found at this path; failure to submit a `submission.csv` at this path will result in a failed attempt.
+- **Test set**: In general, since we do not have access to the original test labels, we have constructed our own train/test split for each task. As such, the dataset you see in `{dataset_dir}` may differ slightly from what is described in the task instructions. Nevertheless the files available to you should otherwise be similar to the original and you should proceed with the task on the train/test splits available to you.
+- **Cheating**: You are not allowed to view others' solutions to this task. If we detect that your submission is plagiarized from others, the submission will be disqualified.
+
+Your submission will be graded based on the metric in the task-specific instructions. Your goal is to achieve the best score possible on the task. 
+
+Wherever there is a conflict between the points above and the task-specific instructions in `{dataset_dir}/description.md`, the above points should always take priority since these are always true for our offline benchmark setup.
+"""
+
+# Extra prompt section telling the agent that a GPU is available.
+ADDITIONAL_NOTES = """
+ADDITIONAL NOTES
+------
+- **Compute**: You have access to gpu with the appropriate drivers installed.
+"""