From eda9322361cc112fc34cb340365688123272b2a3 Mon Sep 17 00:00:00 2001
From: Yizhou Chi <chiyizhou@fuzhi.ai>
Date: Fri, 11 Oct 2024 18:37:35 +0800
Subject: [PATCH 01/22] 1. dynamically add insight 2. insight from scratch in
 real time

---
 expo/MCTS.py                           | 14 +++++--
 expo/experimenter/mcts.py              |  7 +---
 expo/insights/instruction_generator.py | 49 ++++++++++++++++++++---
 expo/insights/solution_designer.py     | 55 ++++++++++++++++++++++++--
 expo/research_assistant.py             |  5 +++
 expo/run_experiment.py                 |  3 ++
 6 files changed, 115 insertions(+), 18 deletions(-)

diff --git a/expo/MCTS.py b/expo/MCTS.py
index 9d778e4ed..7e1d7c88a 100644
--- a/expo/MCTS.py
+++ b/expo/MCTS.py
@@ -235,7 +235,8 @@ class Node:
 
             score_dict = {k: normalize_score(v) for k, v in score_dict.items()}
         self.normalized_reward = score_dict
-        return score_dict
+        result_dict = role.get_solution()
+        return score_dict, result_dict
 
 
 class MCTS:
@@ -281,7 +282,7 @@ class MCTS:
         mcts_logger.log("MCTS", f"Start simulating node {node.id}:")
         while node.children:
             node = random.choice(node.children)
-        reward = await node.run_node(role)
+        reward, result_dict = await node.run_node(role)
         mcts_logger.log("MCTS", f"Simulated node's reward: {reward}")
 
         return reward
@@ -341,12 +342,17 @@ class MCTS:
             scores["test_raw"].append(node.raw_reward["test_score"])
         return scores
 
-    async def search(self, state, rollouts, load_tree=False, reflection=False):
+    async def search(self, state, args):
+        reflection = args.reflection
+        load_tree = args.load_tree
+        rollouts = args.rollouts
+        from_scratch = args.from_scratch
         role, root = initialize_di_root_node(state, reflection=reflection)
         self.root_node = root
         self.instruction_generator = InstructionGenerator(
-            file_path=state["exp_pool_path"], use_fixed_insights=self.use_fixed_insights
+            state=state, use_fixed_insights=self.use_fixed_insights, from_scratch=from_scratch
         )
+        await self.instruction_generator.initialize()
 
         tree_loaded = False
         if load_tree:
diff --git a/expo/experimenter/mcts.py b/expo/experimenter/mcts.py
index c063268c8..d212eb204 100644
--- a/expo/experimenter/mcts.py
+++ b/expo/experimenter/mcts.py
@@ -24,12 +24,7 @@ class MCTSExperimenter(Experimenter):
             mcts = Random(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights)
         else:
             mcts = MCTS(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights)
-        best_nodes = await mcts.search(
-            state=self.state,
-            reflection=self.args.reflection,
-            rollouts=self.args.rollouts,
-            load_tree=self.args.load_tree,
-        )
+        best_nodes = await mcts.search(state=self.state, args=self.args)
         best_node = best_nodes["global_best"]
         dev_best_node = best_nodes["dev_best"]
         score_dict = best_nodes["scores"]
diff --git a/expo/insights/instruction_generator.py b/expo/insights/instruction_generator.py
index 330795730..7fa4d72ea 100644
--- a/expo/insights/instruction_generator.py
+++ b/expo/insights/instruction_generator.py
@@ -1,6 +1,7 @@
 import json
 import os
 import random
+from difflib import SequenceMatcher
 
 from expo.insights.solution_designer import SolutionDesigner
 from expo.utils import clean_json_from_rsp, load_data_config, mcts_logger
@@ -33,11 +34,21 @@ DATA_CONFIG = load_data_config()
 class InstructionGenerator:
     data_config = DATA_CONFIG
 
-    def __init__(self, file_path, use_fixed_insights=False):
-        self.file_path = file_path
+    def __init__(self, state, use_fixed_insights, from_scratch):
+        self.state = state
+        self.file_path = state["exp_pool_path"]
+        self.dataset_info_path = f"{self.data_config['datasets_dir']}/{state['task']}/dataset_info.json"
+        with open(self.dataset_info_path, "r") as file:
+            self.dataset_info = json.load(file)
         self.use_fixed_insights = use_fixed_insights
-        self.analysis_pool = self.load_insight_pool(file_path, use_fixed_insights)
         self.proposer = SolutionDesigner()
+        self.from_scratch = from_scratch
+
+    async def initialize(self):
+        if self.from_scratch:
+            self.insight_pool = await self.generate_solutions_from_scratch(self.dataset_info, self.state["task"])
+        else:
+            self.insight_pool = self.load_insight_pool(self.file_path, self.use_fixed_insights)
 
     @staticmethod
     def load_json_data(json_dir):
@@ -84,14 +95,14 @@ class InstructionGenerator:
             data.extend(fixed_insights)
         for item in data:
             if "task_id" not in item:
-                raise ValueError("task_id is not found in the analysis pool")
+                raise ValueError("task_id is not found in the insight_pool")
 
         if task_id:
             data = [item for item in data if int(item["task_id"]) == int(task_id)]
         return data
 
     async def generate_new_instructions(self, task_id, original_instruction, max_num, ext_info=None):
-        data = self.analysis_pool
+        data = self.insight_pool
         new_instructions = []
         if len(data) == 0:
             mcts_logger.log("MCTS", f"No insights available for task {task_id}")
@@ -108,6 +119,34 @@ class InstructionGenerator:
             new_instructions.append(new_instruction)
         return new_instructions
 
+    async def propose_new_insights(self, solution, score):
+        new_insights = await self.proposer.propose_insights(solution, score)
+        added_insights = self.add_insight(new_insights)
+        return added_insights
+
+    async def generate_solutions_from_scratch(self, dataset_info, dataset_name):
+        insight_pool = await self.proposer.generate_solutions(dataset_info, dataset_name, save_analysis_pool=False)
+        return insight_pool
+
+    def add_insight(self, new_insights):
+        added_insights = []
+        for new_insight in new_insights:
+            if not self.is_similar_to_existing(new_insight):
+                added_insights.append(new_insight)
+                self.insight_pool.append(new_insight)
+        return added_insights
+
+    def is_similar_to_existing(self, new_insight, similarity_threshold=0.8):
+        for existing_insight in self.insight_pool:
+            similarity = self.calculate_similarity(new_insight["Analysis"], existing_insight["Analysis"])
+            if similarity > similarity_threshold:
+                return True
+        return False
+
+    @staticmethod
+    def calculate_similarity(text1, text2):
+        return SequenceMatcher(None, text1, text2).ratio()
+
     @staticmethod
     async def generate_new_instruction(original_instruction, insights, ext_info):
         prompt = CHANGE_INSTRUCTION.format(instruction=original_instruction, insights=insights)
diff --git a/expo/insights/solution_designer.py b/expo/insights/solution_designer.py
index 9968131ca..2336911db 100644
--- a/expo/insights/solution_designer.py
+++ b/expo/insights/solution_designer.py
@@ -70,6 +70,45 @@ Your model choices should be advanced enough to be helpful.
 ```
 """
 
+
+INSIGHT_PROPOSAL_PROMPT = """
+You are an AI assistant tasked with analyzing a machine learning solution and proposing new insights to improve its performance. Given the current solution code and development score, suggest innovative approaches to enhance the model.
+
+Current Solution Code:
+{solution_code}
+
+Development Score: {dev_score}
+
+Based on this information, propose 3-5 new insights across different aspects of the machine learning pipeline (Data Preprocessing, Feature Engineering, and Model Training). Your insights should be specific, actionable, and have the potential to improve the model's performance.
+
+Please format your response as a JSON array with the following structure:
+[
+
+    {{
+        "task_type": "Data Preprocessing",
+        "insights": [
+            "insight1",
+            "insight2"
+        ]
+    }},
+    {{
+        "task_type": "Feature Engineering",
+        "insights": [
+            "insight1",
+            "insight2"
+        ]
+    }},
+    {{
+        "task_type": "Model Training",
+        "insights": [
+            "insight1",
+            "insight2"
+        ]
+    }}
+]
+"""
+
+
 KEY_DATASET_FEATURES = [
     "NumberOfClasses",
     "NumberOfFeatures",
@@ -86,7 +125,7 @@ TASK_TO_ID = {"EDA": 1, "Data Preprocessing": 2, "Feature Engineering": 3, "Mode
 class SolutionDesigner:
     data_dir: str = DATA_CONFIG["datasets_dir"]
 
-    async def generate_solutions(self, dataset_info, dataset_name):
+    async def generate_solutions(self, dataset_info, dataset_name, save_analysis_pool=True):
         llm = LLM()
         context = DATASET_INSIGHT_PROMPT.format(
             dataset=dataset_info["description"],
@@ -96,8 +135,18 @@ class SolutionDesigner:
         rsp = await llm.aask(context)
         rsp = clean_json_from_rsp(rsp)
         analysis_pool = self.process_analysis_pool(json.loads(rsp))
-        dataset_path = f"{self.data_dir}/{dataset_name}"
-        self.save_analysis_pool(dataset_path, analysis_pool)
+        if save_analysis_pool:
+            dataset_path = f"{self.data_dir}/{dataset_name}"
+            self.save_analysis_pool(dataset_path, analysis_pool)
+        return analysis_pool
+
+    async def propose_new_insights(self, solution, score):
+        llm = LLM()
+        context = INSIGHT_PROPOSAL_PROMPT.format(solution_code=solution, dev_score=score)
+        rsp = await llm.aask(context)
+        rsp = clean_json_from_rsp(rsp)
+        new_insights = self.process_analysis_pool(json.loads(rsp))
+        return new_insights
 
     def process_analysis_pool(self, insights_rsp):
         analysis_pool = []
diff --git a/expo/research_assistant.py b/expo/research_assistant.py
index fb34ece38..0b53521a3 100644
--- a/expo/research_assistant.py
+++ b/expo/research_assistant.py
@@ -139,6 +139,11 @@ class ResearchAssistant(DataInterpreter):
             save_notebook(role=self, save_dir=self.role_dir, name=self.get_node_name())
         return task_result
 
+    def get_solution(self):
+        codes = [task.code for task in self.planner.plan.tasks]
+        results = [task.result for task in self.planner.plan.tasks]
+        return {"codes": codes, "results": results}
+
     def save_state(self, static_save=False):
         """
         attribute:
diff --git a/expo/run_experiment.py b/expo/run_experiment.py
index 15be27d60..c43da12fd 100644
--- a/expo/run_experiment.py
+++ b/expo/run_experiment.py
@@ -32,6 +32,9 @@ def get_mcts_args(parser):
     parser.add_argument("--rollouts", type=int, default=5)
     parser.add_argument("--use_fixed_insights", dest="use_fixed_insights", action="store_true")
     parser.add_argument("--start_task_id", type=int, default=2)
+    parser.add_argument(
+        "--from_scratch", dest="from_scratch", action="store_true", help="Generate solutions from scratch"
+    )
 
 
 def get_aug_exp_args(parser):

From 3a57060e25a8acfd2ed0f80b4d68a5a110425159 Mon Sep 17 00:00:00 2001
From: Yizhou Chi <chiyizhou@fuzhi.ai>
Date: Sat, 12 Oct 2024 17:16:51 +0800
Subject: [PATCH 02/22] 1. add eval_func for sela and compatibility to others
 2. llm extract score (use all code block and execution results) 3. add
 argument for custom dataset dir 4. dataset custom requirement support

---
 expo/MCTS.py                                | 65 ++++++++++++++-------
 expo/data/custom_task.py                    | 38 ++++++++++++
 expo/data/dataset.py                        |  2 +-
 expo/evaluation/evaluation.py               | 12 ++++
 expo/experimenter/experimenter.py           |  5 +-
 expo/experimenter/mcts.py                   | 21 ++++++-
 expo/experimenter/mle_bench/instructions.py | 47 +++++++++++++++
 expo/insights/instruction_generator.py      | 14 +++--
 expo/research_assistant.py                  | 28 ++++++---
 expo/run_experiment.py                      |  6 ++
 expo/utils.py                               |  2 +
 11 files changed, 202 insertions(+), 38 deletions(-)
 create mode 100644 expo/data/custom_task.py
 create mode 100644 expo/experimenter/mle_bench/instructions.py

diff --git a/expo/MCTS.py b/expo/MCTS.py
index 7e1d7c88a..a8410748e 100644
--- a/expo/MCTS.py
+++ b/expo/MCTS.py
@@ -3,10 +3,12 @@ import math
 import os
 import pickle
 import random
+import shutil
 
 import numpy as np
 import pandas as pd
 
+from expo.data.custom_task import get_mle_bench_requirements
 from expo.data.dataset import generate_task_requirement, get_split_dataset_path
 from expo.evaluation.evaluation import evaluate_score
 from expo.insights.instruction_generator import InstructionGenerator
@@ -17,9 +19,6 @@ from metagpt.utils.common import read_json_file
 
 
 def initialize_di_root_node(state, reflection: bool = True):
-    # state = create_initial_state(
-    #     task, start_task_id=start_task_id, data_config=data_config, low_is_better=low_is_better, name=name
-    # )
     role = ResearchAssistant(
         node_id="0", start_task_id=state["start_task_id"], use_reflection=reflection, role_dir=state["node_dir"]
     )
@@ -29,20 +28,33 @@ def initialize_di_root_node(state, reflection: bool = True):
 def create_initial_state(
     task, start_task_id, data_config, low_is_better: bool, name: str, special_instruction: str, args
 ):
+    external_eval = args.external_eval
+
+    if args.custom_dataset_dir:
+        dataset_config = None
+        datasets_dir = args.custom_dataset_dir
+        requirement = get_mle_bench_requirements(args.custom_dataset_dir, data_config)
+        exp_pool_path = None
+    else:
+        dataset_config = data_config["datasets"][task]
+        datasets_dir = get_split_dataset_path(task, data_config)
+        requirement = generate_task_requirement(task, data_config, is_di=True, special_instruction=special_instruction)
+        exp_pool_path = get_exp_pool_path(task, data_config, pool_name="ds_analysis_pool")
+
     initial_state = {
         "task": task,
         "work_dir": data_config["work_dir"],
         "node_dir": os.path.join(data_config["work_dir"], data_config["role_dir"], f"{task}{name}"),
-        "dataset_config": data_config["datasets"][task],
-        "datasets_dir": get_split_dataset_path(task, data_config),
-        "exp_pool_path": get_exp_pool_path(task, data_config, pool_name="ds_analysis_pool"),
-        "requirement": generate_task_requirement(
-            task, data_config, is_di=True, special_instruction=special_instruction
-        ),
+        "dataset_config": dataset_config,
+        "datasets_dir": datasets_dir,  # won't be used if external eval is used
+        "exp_pool_path": exp_pool_path,
+        "requirement": requirement,
         "has_run": False,
         "start_task_id": start_task_id,
         "low_is_better": low_is_better,
         "role_timeout": args.role_timeout,
+        "external_eval": external_eval,
+        "custom_dataset_dir": args.custom_dataset_dir,
     }
     os.makedirs(initial_state["node_dir"], exist_ok=True)
     return initial_state
@@ -173,22 +185,34 @@ class Node:
             node.save_new_role(new_role)
             self.add_child(node)
 
-    def evaluate_prediction(self, split):
-        pred_path = os.path.join(self.state["work_dir"], self.state["task"], f"{split}_predictions.csv")
-        pred_node_path = os.path.join(self.state["node_dir"], f"Node-{self.id}-{split}_predictions.csv")
+    def get_predictions_path(self, split):
+        return os.path.join(self.state["node_dir"], f"Node-{self.id}-{split}_predictions.csv")
+
+    def get_and_move_predictions(self, split):
+        if not os.path.exists(self.get_predictions_path(split)):
+            pred_path = os.path.join(self.state["work_dir"], self.state["task"], f"{split}_predictions.csv")
+            shutil.copy(pred_path, self.get_predictions_path(split))
+            os.remove(pred_path)
+        return pd.read_csv(self.get_predictions_path(split))
+
+    def get_gt(self, split):
         gt_path = os.path.join(self.state["datasets_dir"][f"{split}_target"])
-        preds = pd.read_csv(pred_path)["target"]
-        preds.to_csv(pred_node_path, index=False)
-        gt = pd.read_csv(gt_path)["target"]
+        return pd.read_csv(gt_path)
+
+    def evaluate_prediction(self, split):
+        preds = self.get_and_move_predictions(split)["target"]
+        gt = self.get_gt(split)["target"]
         metric = self.state["dataset_config"]["metric"]
-        # remove original predictions.csv
-        os.remove(pred_path)
         return evaluate_score(preds, gt, metric)
 
     def evaluate_simulation(self, score_dict):
-        scores = {"dev_score": self.evaluate_prediction("dev"), "test_score": self.evaluate_prediction("test")}
-        scores["score"] = scores["dev_score"]
-        score_dict.update(scores)
+        if self.state["external_eval"]:  # use external evaluation
+            scores = {"dev_score": self.evaluate_prediction("dev"), "test_score": self.evaluate_prediction("test")}
+            scores["score"] = scores["dev_score"]
+            score_dict.update(scores)
+        else:
+            self.get_and_move_predictions("dev")
+            self.get_and_move_predictions("test")
         return score_dict
 
     async def run_node(self, role=None):
@@ -215,7 +239,6 @@ class Node:
                 mcts_logger.log("MCTS", f"Role-level timeout: {e}")
                 break
             except Exception as e:
-                print(f"Error: {e}")
                 mcts_logger.log("MCTS", f"Error in running the role: {e}")
                 num_runs += 1
 
diff --git a/expo/data/custom_task.py b/expo/data/custom_task.py
new file mode 100644
index 000000000..2bd88abde
--- /dev/null
+++ b/expo/data/custom_task.py
@@ -0,0 +1,38 @@
+import os
+
+from expo.experimenter.mle_bench.instructions import (
+    ADDITIONAL_NOTES,
+    INSTRUCTIONS,
+    INSTRUCTIONS_OBFUSCATED,
+)
+
+MLE_BENCH_FILES = ["description.md", "description_obfuscated.md"]
+
+
+MLE_REQUIREMENTS = """
+{instructions}
+
+{additonal_notes}
+
+COMPETITION INSTRUCTIONS
+------
+
+{task_description}
+
+"""
+
+
+def get_mle_bench_requirements(dataset_dir, data_config, obfuscated=False):
+    if obfuscated:
+        instructions = INSTRUCTIONS_OBFUSCATED
+        task_file = "description_obfuscated.md"
+    else:
+        instructions = INSTRUCTIONS
+        task_file = "description.md"
+
+    with open(os.path.join(dataset_dir, task_file)) as f:
+        task_description = f.read()
+    mle_requirement = MLE_REQUIREMENTS.format(
+        instructions=instructions, additonal_notes=ADDITIONAL_NOTES, task_description=task_description
+    )
+    return mle_requirement
diff --git a/expo/data/dataset.py b/expo/data/dataset.py
index e076284d6..8b0c5b980 100644
--- a/expo/data/dataset.py
+++ b/expo/data/dataset.py
@@ -268,7 +268,7 @@ class ExpDataset:
         dataset_info = self.get_dataset_info()
         num_classes = dataset_info["metadata"]["NumberOfClasses"]
         if num_classes == 2:
-            metric = "f1"
+            metric = "f1 binary"
         elif 2 < num_classes <= 200:
             metric = "f1 weighted"
         elif num_classes > 200 or num_classes == 0:
diff --git a/expo/evaluation/evaluation.py b/expo/evaluation/evaluation.py
index 16b3acb71..1ba7fa60f 100644
--- a/expo/evaluation/evaluation.py
+++ b/expo/evaluation/evaluation.py
@@ -22,3 +22,15 @@ def evaluate_score(pred, gt, metric):
         return mean_squared_error(np.log1p(gt), np.log1p(pred), squared=False)
     else:
         raise ValueError(f"Metric {metric} not supported")
+
+
+def node_evaluate_score_sela(node):
+    preds = node.get_and_move_predictions("test")["target"]
+    gt = node.get_gt("test")["target"]
+    metric = node.state["dataset_config"]["metric"]
+    return evaluate_score(preds, gt, metric)
+
+
+def node_evaluate_score_mlebench(node):
+    # TODO
+    return 0
diff --git a/expo/experimenter/experimenter.py b/expo/experimenter/experimenter.py
index 9aa879e24..417adabad 100644
--- a/expo/experimenter/experimenter.py
+++ b/expo/experimenter/experimenter.py
@@ -43,7 +43,10 @@ class Experimenter:
             except Exception as e:
                 print(f"Error: {e}")
                 num_runs += 1
-        save_notebook(role=di, save_dir=self.result_path, name=f"{self.args.task}_{self.start_time}_{run_idx}")
+        # save_notebook(role=di, save_dir=self.result_path, name=f"{self.args.task}_{self.start_time}_{run_idx}")
+        save_name = self.get_save_name()
+        save_notebook(role=di, save_dir=self.result_path, name=f"{save_name}_{run_idx}")
+
         if not run_finished:
             score_dict = {"train_score": -1, "dev_score": -1, "test_score": -1, "score": -1}
         return score_dict
diff --git a/expo/experimenter/mcts.py b/expo/experimenter/mcts.py
index d212eb204..37fc7a071 100644
--- a/expo/experimenter/mcts.py
+++ b/expo/experimenter/mcts.py
@@ -1,5 +1,9 @@
 import shutil
 
+from expo.evaluation.evaluation import (
+    node_evaluate_score_mlebench,
+    node_evaluate_score_sela,
+)
 from expo.evaluation.visualize_mcts import get_tree_text
 from expo.experimenter.experimenter import Experimenter
 from expo.Greedy import Greedy, Random
@@ -14,25 +18,35 @@ class MCTSExperimenter(Experimenter):
             self.start_task_id = 1  # start from datapreprocessing if it is image task
         else:
             self.start_task_id = args.start_task_id
+
+        if args.eval_func == "sela":
+            self.eval_func = node_evaluate_score_sela
+        elif args.eval_func == "mlebench":
+            self.eval_func = node_evaluate_score_mlebench
+
         super().__init__(args, **kwargs)
         self.tree_mode = tree_mode
 
     async def run_experiment(self):
+        use_fixed_insights = self.args.use_fixed_insights
+        depth = 5
         if self.tree_mode == "greedy":
-            mcts = Greedy(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights)
+            mcts = Greedy(root_node=None, max_depth=depth, use_fixed_insights=use_fixed_insights)
         elif self.tree_mode == "random":
-            mcts = Random(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights)
+            mcts = Random(root_node=None, max_depth=depth, use_fixed_insights=use_fixed_insights)
         else:
-            mcts = MCTS(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights)
+            mcts = MCTS(root_node=None, max_depth=depth, use_fixed_insights=use_fixed_insights)
         best_nodes = await mcts.search(state=self.state, args=self.args)
         best_node = best_nodes["global_best"]
         dev_best_node = best_nodes["dev_best"]
         score_dict = best_nodes["scores"]
+        additional_scores = {"grader": self.eval_func(dev_best_node)}
 
         text, num_generated_codes = get_tree_text(mcts.root_node)
         text += f"Generated {num_generated_codes} unique codes.\n"
         text += f"Best node: {best_node.id}, score: {best_node.raw_reward}\n"
         text += f"Dev best node: {dev_best_node.id}, score: {dev_best_node.raw_reward}\n"
+        text += f"Grader score: {additional_scores['grader']}\n"
         print(text)
         results = [
             {
@@ -45,6 +59,7 @@ class MCTSExperimenter(Experimenter):
                 "tree_text": text,
                 "args": vars(self.args),
                 "scores": score_dict,
+                "additional_scores": additional_scores,
             }
         ]
         self.save_result(results)
diff --git a/expo/experimenter/mle_bench/instructions.py b/expo/experimenter/mle_bench/instructions.py
new file mode 100644
index 000000000..301b465c1
--- /dev/null
+++ b/expo/experimenter/mle_bench/instructions.py
@@ -0,0 +1,47 @@
+INSTRUCTIONS = """
+You are participating in *MLE-bench*, an offline version of Kaggle competitions adapted to test the coding ability of agents like yourself.
+
+You will be given a machine learning task. You must solve the task by training a model and running the model on the test set to produce a submission file. You may not hand-label the data to produce a submission file.
+
+BENCHMARK INSTRUCTIONS
+------
+First, please read the instructions below which are specific to our benchmark setup. Note, these instructions can be found again in `/home/instructions.txt`.
+
+- **Competition-specific instructions**: The instructions for the competition are available in `/home/data/description.md`. Read this file to understand the problem setup for the competition you are currently solving.
+- **Dataset**: The dataset files for the competition are available in `/home/data/`.
+- **Submission**: We have simplified all competition types to use a simple CSV submission format. Regardless of what the competition instructions say, you MUST produce a submission file at `/home/submission/submission.csv`. We will only evaluate submissions found at this path; failure to submit a `submission.csv` at this path will result in a failed attempt.
+- **Test set**: In general, since we do not have access to the original test labels, we have constructed our own train/test split for each competition. As such, the dataset you see in `/home/data/` may differ slightly from what is described in the competition instructions. Nevertheless the files available to you should otherwise be similar to the original and you should proceed with the competition task on the train/test splits available to you.
+- **Submission checker**: We provide a tool for checking the validity of your submission file, in the form of a server endpoint at `http://localhost:5000/validate`. If you post a submission file to the server (e.g. `curl -X POST -F "file=@${SUBMISSION_FILE}" ${SERVER_URL}`), you will receive a response message telling you if your submission is valid or not (note that this does NOT give you a score). For your convenience, we provide a script `/home/validate_submission.sh` which can help you make this submission to the validation server.
+- **Cheating**: You are not allowed to view others' solutions to this competition. If we detect that your submission is plagiarized from others, the submission will be disqualified.
+
+Your submission will be graded based on the metric in the competition-specific instructions. Your goal is to achieve the best score possible on the competition task. 
+
+Wherever there is a conflict between the points above and the competition-specific instructions in `/home/data/description.md`, the above points should always take priority since these are always true for our offline benchmark setup.
+"""
+
+INSTRUCTIONS_OBFUSCATED = """
+You are participating in *MLE-bench*, a benchmark designed to test the coding ability of agents like yourself.
+
+You will be given a machine learning task. You must solve the task by training a model and running the model on the test set to produce a submission file. You may not hand-label the data to produce a submission file.
+
+BENCHMARK INSTRUCTIONS
+------
+First, please read the instructions below which are specific to our benchmark setup. Note, these instructions can be found again in `/home/instructions.txt`.
+
+- **Task-specific instructions**: The instructions for the task are available in `/home/data/description.md`. Read this file to understand the problem setup for the task you are currently solving.
+- **Dataset**: The dataset files for the task are available in `/home/data/`.
+- **Submission**: Regardless of what the task instructions say, you MUST produce a submission file at `/home/submission/submission.csv`. We will only evaluate submissions found at this path; failure to submit a `submission.csv` at this path will result in a failed attempt.
+- **Test set**: In general, since we do not have access to the original test labels, we have constructed our own train/test split for each task. As such, the dataset you see in `/home/data/` may differ slightly from what is described in the task instructions. Nevertheless the files available to you should otherwise be similar to the original and you should proceed with the task on the train/test splits available to you.
+- **Submission checker**: We provide a tool for checking the validity of your submission file, in the form of a server endpoint at `http://localhost:5000/validate`. If you post a submission file to the server (e.g. `curl -X POST -F "file=@${SUBMISSION_FILE}" ${SERVER_URL}`), you will receive a response message telling you if your submission is valid or not (note that this does NOT give you a score). For your convenience, we provide a script `/home/validate_submission.sh` which can help you make this submission to the validation server.
+- **Cheating**: You are not allowed to view others' solutions to this task. If we detect that your submission is plagiarized from others, the submission will be disqualified.
+
+Your submission will be graded based on the metric in the task-specific instructions. Your goal is to achieve the best score possible on the task. 
+
+Wherever there is a conflict between the points above and the task-specific instructions in `/home/data/description.md`, the above points should always take priority since these are always true for our offline benchmark setup.
+"""
+
+ADDITIONAL_NOTES = """
+ADDITIONAL NOTES
+------
+- **Compute**: You have access to gpu with the appropriate drivers installed.
+"""
diff --git a/expo/insights/instruction_generator.py b/expo/insights/instruction_generator.py
index 7fa4d72ea..7fe5ceece 100644
--- a/expo/insights/instruction_generator.py
+++ b/expo/insights/instruction_generator.py
@@ -37,12 +37,18 @@ class InstructionGenerator:
     def __init__(self, state, use_fixed_insights, from_scratch):
         self.state = state
         self.file_path = state["exp_pool_path"]
-        self.dataset_info_path = f"{self.data_config['datasets_dir']}/{state['task']}/dataset_info.json"
-        with open(self.dataset_info_path, "r") as file:
-            self.dataset_info = json.load(file)
+        if state["custom_dataset_dir"]:
+            self.dataset_info = "xxx"
+        else:
+            dataset_info_path = f"{self.data_config['datasets_dir']}/{state['task']}/dataset_info.json"
+            with open(dataset_info_path, "r") as file:
+                self.dataset_info = json.load(file)
         self.use_fixed_insights = use_fixed_insights
         self.proposer = SolutionDesigner()
-        self.from_scratch = from_scratch
+        if self.file_path is None:
+            self.from_scratch = True
+        else:
+            self.from_scratch = from_scratch
 
     async def initialize(self):
         if self.from_scratch:
diff --git a/expo/research_assistant.py b/expo/research_assistant.py
index 0b53521a3..d068dd4e5 100644
--- a/expo/research_assistant.py
+++ b/expo/research_assistant.py
@@ -13,15 +13,19 @@ from metagpt.roles.di.data_interpreter import DataInterpreter
 from metagpt.schema import Message, Task, TaskResult
 from metagpt.utils.common import CodeParser, write_json_file
 
-EXTRACT_SCORE_PROMPT = """
-# Code:
+CODE_BLOCK_RESULT = """
+## Code:
 {code}
 
-# Execution Result:
+## Execution Result:
 {result}
+"""
 
+EXTRACT_SCORE_PROMPT = """
+# Code Blocks
+{code_block}
 # Instruction:
-Based on the code and execution result, please extract the scores and return it as a dictionary.
+Based on the code and execution result, please extract the **final scores** and return it as a dictionary.
 If you cannot find the scores, please still return a dictionary with the keys 'train_score', 'dev_score', and 'test_score', and set the values to -1.
 
 # Format:
@@ -109,9 +113,17 @@ class ResearchAssistant(DataInterpreter):
         return score_dict
 
     async def llm_extract_score(self):
-        result_text = self.planner.plan.task_map[str(len(self.planner.plan.task_map))].result
-        code_text = self.planner.plan.task_map[str(len(self.planner.plan.task_map))].code
-        rsp = await self.llm.aask(EXTRACT_SCORE_PROMPT.format(code=code_text, result=result_text, role="user"))
+        # result_text = self.planner.plan.task_map[str(len(self.planner.plan.task_map))].result
+        # code_text = self.planner.plan.task_map[str(len(self.planner.plan.task_map))].code
+        num_tasks = len(self.planner.plan.task_map)
+        task_map = self.planner.plan.task_map
+        code_block = "\n".join(
+            [
+                CODE_BLOCK_RESULT.format(code=task_map[str(i + 1)].code, result=task_map[str(i + 1)].result)
+                for i in range(num_tasks)
+            ]
+        )
+        rsp = await self.llm.aask(EXTRACT_SCORE_PROMPT.format(code_block=code_block, role="user"))
         json_block = CodeParser.parse_code(block=None, text=rsp)
         score_dict = json.loads(json_block)
         return score_dict
@@ -161,7 +173,7 @@ class ResearchAssistant(DataInterpreter):
         stg_path = self.role_dir
         name = self.get_node_name()
         role_path = os.path.join(stg_path, f"{name}.json")
-        # 将状态保存为 JSON 文件
+        # save state as json file
         write_json_file(role_path, self.model_dump())
 
     def remap_tasks(self):
diff --git a/expo/run_experiment.py b/expo/run_experiment.py
index c43da12fd..53fcdd18c 100644
--- a/expo/run_experiment.py
+++ b/expo/run_experiment.py
@@ -31,10 +31,16 @@ def get_mcts_args(parser):
     parser.set_defaults(load_tree=False)
     parser.add_argument("--rollouts", type=int, default=5)
     parser.add_argument("--use_fixed_insights", dest="use_fixed_insights", action="store_true")
+    parser.set_defaults(use_fixed_insights=False)
     parser.add_argument("--start_task_id", type=int, default=2)
     parser.add_argument(
         "--from_scratch", dest="from_scratch", action="store_true", help="Generate solutions from scratch"
     )
+    parser.set_defaults(from_scratch=False)
+    parser.add_argument("--no_external_eval", dest="external_eval", action="store_false")
+    parser.set_defaults(external_eval=True)
+    parser.add_argument("--eval_func", type=str, default="sela", choices=["sela", "mlebench"])
+    parser.add_argument("--custom_dataset_dir", type=str, default=None)
 
 
 def get_aug_exp_args(parser):
diff --git a/expo/utils.py b/expo/utils.py
index b022879b0..f3381c91c 100644
--- a/expo/utils.py
+++ b/expo/utils.py
@@ -51,6 +51,8 @@ def get_exp_pool_path(task_name, data_config, pool_name="analysis_pool"):
             f"Dataset {task_name} not found in config file. Available datasets: {data_config['datasets'].keys()}"
         )
     exp_pool_path = os.path.join(data_path, f"{pool_name}.json")
+    if not os.path.exists(exp_pool_path):
+        return None
     return exp_pool_path
 
 

From a91003a7fe2235609d99e74d6d4a93402fb61fc4 Mon Sep 17 00:00:00 2001
From: Yizhou Chi <chiyizhou@fuzhi.ai>
Date: Mon, 14 Oct 2024 09:56:55 +0800
Subject: [PATCH 03/22] disable submission

---
 expo/data/custom_task.py                    | 22 +++++++++++++---
 expo/experimenter/mle_bench/instructions.py | 29 +++++++++++----------
 expo/insights/instruction_generator.py      |  3 ++-
 expo/insights/solution_designer.py          | 26 +++++++++++++-----
 4 files changed, 55 insertions(+), 25 deletions(-)

diff --git a/expo/data/custom_task.py b/expo/data/custom_task.py
index 2bd88abde..14eb6aac2 100644
--- a/expo/data/custom_task.py
+++ b/expo/data/custom_task.py
@@ -19,20 +19,34 @@ COMPETITION INSTRUCTIONS
 
 {task_description}
 
+## More Instructions
+- output_dir: {output_dir}
+- Besides `submission.csv`, you should also save your output in the output directory.
+- Save the prediction results of BOTH the dev set and test set in `dev_predictions.csv` and `test_predictions.csv` respectively in the output directory.
+
+Do not make visualizations.
 """
 
 
 def get_mle_bench_requirements(dataset_dir, data_config, obfuscated=False):
+    work_dir = data_config["work_dir"]
+    output_dir = f"{work_dir}/output"
+    os.makedirs(output_dir, exist_ok=True)
+
     if obfuscated:
-        instructions = INSTRUCTIONS_OBFUSCATED
+        instructions = INSTRUCTIONS_OBFUSCATED.format(dataset_dir=dataset_dir, output_dir=output_dir)
         task_file = "description_obfuscated.md"
     else:
-        instructions = INSTRUCTIONS
+        instructions = INSTRUCTIONS.format(dataset_dir=dataset_dir, output_dir=output_dir)
         task_file = "description.md"
 
-    with open(os.path.join(dataset_dir, task_file)) as f:
+    with open(os.path.join(dataset_dir, task_file), encoding="utf-8") as f:
         task_description = f.read()
     mle_requirement = MLE_REQUIREMENTS.format(
-        instructions=instructions, additonal_notes=ADDITIONAL_NOTES, task_description=task_description
+        instructions=instructions,
+        additonal_notes=ADDITIONAL_NOTES,
+        task_description=task_description,
+        output_dir=output_dir,
     )
+    print(mle_requirement)
     return mle_requirement
diff --git a/expo/experimenter/mle_bench/instructions.py b/expo/experimenter/mle_bench/instructions.py
index 301b465c1..631f410dc 100644
--- a/expo/experimenter/mle_bench/instructions.py
+++ b/expo/experimenter/mle_bench/instructions.py
@@ -1,3 +1,6 @@
+SUBMISSION_INSTURCTION = """
+- **Submission checker**: We provide a tool for checking the validity of your submission file, in the form of a server endpoint at `http://localhost:5000/validate`. If you post a submission file to the server (e.g. `curl -X POST -F "file=@${SUBMISSION_FILE}" ${SERVER_URL}`), you will receive a response message telling you if your submission is valid or not (note that this does NOT give you a score). For your convenience, we provide a script `/home/validate_submission.sh` which can help you make this submission to the validation server.
+"""
 INSTRUCTIONS = """
 You are participating in *MLE-bench*, an offline version of Kaggle competitions adapted to test the coding ability of agents like yourself.
 
@@ -5,18 +8,17 @@ You will be given a machine learning task. You must solve the task by training a
 
 BENCHMARK INSTRUCTIONS
 ------
-First, please read the instructions below which are specific to our benchmark setup. Note, these instructions can be found again in `/home/instructions.txt`.
+First, please read the instructions below which are specific to our benchmark setup. Note, these instructions can be found again in `expo.experimenter.mle_bench.instructions.py`.
 
-- **Competition-specific instructions**: The instructions for the competition are available in `/home/data/description.md`. Read this file to understand the problem setup for the competition you are currently solving.
-- **Dataset**: The dataset files for the competition are available in `/home/data/`.
-- **Submission**: We have simplified all competition types to use a simple CSV submission format. Regardless of what the competition instructions say, you MUST produce a submission file at `/home/submission/submission.csv`. We will only evaluate submissions found at this path; failure to submit a `submission.csv` at this path will result in a failed attempt.
-- **Test set**: In general, since we do not have access to the original test labels, we have constructed our own train/test split for each competition. As such, the dataset you see in `/home/data/` may differ slightly from what is described in the competition instructions. Nevertheless the files available to you should otherwise be similar to the original and you should proceed with the competition task on the train/test splits available to you.
-- **Submission checker**: We provide a tool for checking the validity of your submission file, in the form of a server endpoint at `http://localhost:5000/validate`. If you post a submission file to the server (e.g. `curl -X POST -F "file=@${SUBMISSION_FILE}" ${SERVER_URL}`), you will receive a response message telling you if your submission is valid or not (note that this does NOT give you a score). For your convenience, we provide a script `/home/validate_submission.sh` which can help you make this submission to the validation server.
+- **Competition-specific instructions**: The instructions for the competition are available in `{dataset_dir}/description.md`. Read this file to understand the problem setup for the competition you are currently solving.
+- **Dataset**: The dataset files for the competition are available in `{dataset_dir}`.
+- **Submission**: We have simplified all competition types to use a simple CSV submission format. Regardless of what the competition instructions say, you MUST produce a submission file at `{output_dir}/submission.csv`. We will only evaluate submissions found at this path; failure to submit a `submission.csv` at this path will result in a failed attempt.
+- **Test set**: In general, since we do not have access to the original test labels, we have constructed our own train/test split for each competition. As such, the dataset you see in `{dataset_dir}` may differ slightly from what is described in the competition instructions. Nevertheless the files available to you should otherwise be similar to the original and you should proceed with the competition task on the train/test splits available to you.
 - **Cheating**: You are not allowed to view others' solutions to this competition. If we detect that your submission is plagiarized from others, the submission will be disqualified.
 
 Your submission will be graded based on the metric in the competition-specific instructions. Your goal is to achieve the best score possible on the competition task. 
 
-Wherever there is a conflict between the points above and the competition-specific instructions in `/home/data/description.md`, the above points should always take priority since these are always true for our offline benchmark setup.
+Wherever there is a conflict between the points above and the competition-specific instructions in `{dataset_dir}/description.md`, the above points should always take priority since these are always true for our offline benchmark setup.
 """
 
 INSTRUCTIONS_OBFUSCATED = """
@@ -26,18 +28,17 @@ You will be given a machine learning task. You must solve the task by training a
 
 BENCHMARK INSTRUCTIONS
 ------
-First, please read the instructions below which are specific to our benchmark setup. Note, these instructions can be found again in `/home/instructions.txt`.
+First, please read the instructions below which are specific to our benchmark setup. Note, these instructions can be found again in `expo.experimenter.mle_bench.instructions.py`.
 
-- **Task-specific instructions**: The instructions for the task are available in `/home/data/description.md`. Read this file to understand the problem setup for the task you are currently solving.
-- **Dataset**: The dataset files for the task are available in `/home/data/`.
-- **Submission**: Regardless of what the task instructions say, you MUST produce a submission file at `/home/submission/submission.csv`. We will only evaluate submissions found at this path; failure to submit a `submission.csv` at this path will result in a failed attempt.
-- **Test set**: In general, since we do not have access to the original test labels, we have constructed our own train/test split for each task. As such, the dataset you see in `/home/data/` may differ slightly from what is described in the task instructions. Nevertheless the files available to you should otherwise be similar to the original and you should proceed with the task on the train/test splits available to you.
-- **Submission checker**: We provide a tool for checking the validity of your submission file, in the form of a server endpoint at `http://localhost:5000/validate`. If you post a submission file to the server (e.g. `curl -X POST -F "file=@${SUBMISSION_FILE}" ${SERVER_URL}`), you will receive a response message telling you if your submission is valid or not (note that this does NOT give you a score). For your convenience, we provide a script `/home/validate_submission.sh` which can help you make this submission to the validation server.
+- **Task-specific instructions**: The instructions for the task are available in `{dataset_dir}/description.md`. Read this file to understand the problem setup for the task you are currently solving.
+- **Dataset**: The dataset files for the task are available in `{dataset_dir}/`.
+- **Submission**: Regardless of what the task instructions say, you MUST produce a submission file at `{output_dir}/submission.csv`. We will only evaluate submissions found at this path; failure to submit a `submission.csv` at this path will result in a failed attempt.
+- **Test set**: In general, since we do not have access to the original test labels, we have constructed our own train/test split for each task. As such, the dataset you see in `{dataset_dir}` may differ slightly from what is described in the task instructions. Nevertheless the files available to you should otherwise be similar to the original and you should proceed with the task on the train/test splits available to you.
 - **Cheating**: You are not allowed to view others' solutions to this task. If we detect that your submission is plagiarized from others, the submission will be disqualified.
 
 Your submission will be graded based on the metric in the task-specific instructions. Your goal is to achieve the best score possible on the task. 
 
-Wherever there is a conflict between the points above and the task-specific instructions in `/home/data/description.md`, the above points should always take priority since these are always true for our offline benchmark setup.
+Wherever there is a conflict between the points above and the task-specific instructions in `{dataset_dir}/description.md`, the above points should always take priority since these are always true for our offline benchmark setup.
 """
 
 ADDITIONAL_NOTES = """
diff --git a/expo/insights/instruction_generator.py b/expo/insights/instruction_generator.py
index 7fe5ceece..835c1ff9d 100644
--- a/expo/insights/instruction_generator.py
+++ b/expo/insights/instruction_generator.py
@@ -38,7 +38,8 @@ class InstructionGenerator:
         self.state = state
         self.file_path = state["exp_pool_path"]
         if state["custom_dataset_dir"]:
-            self.dataset_info = "xxx"
+            with open(f"{state['custom_dataset_dir']}/description.md", "r", encoding="utf-8") as file:
+                self.dataset_info = file.read()
         else:
             dataset_info_path = f"{self.data_config['datasets_dir']}/{state['task']}/dataset_info.json"
             with open(dataset_info_path, "r") as file:
diff --git a/expo/insights/solution_designer.py b/expo/insights/solution_designer.py
index 2336911db..262caa0f6 100644
--- a/expo/insights/solution_designer.py
+++ b/expo/insights/solution_designer.py
@@ -5,7 +5,8 @@ from metagpt.llm import LLM
 
 DATA_CONFIG = load_data_config()
 
-DATASET_INSIGHT_PROMPT = """
+
+DATASET_DESCRIPTION_SELA_PROMPT = """
 # Dataset Description
 {dataset}
 
@@ -14,6 +15,15 @@ DATASET_INSIGHT_PROMPT = """
 
 # Dataset Head
 {head}
+"""
+
+DATASET_DESCRIPTION_CUSTOM_PROMPT = """
+# Dataset Description
+{dataset_description}
+"""
+
+DATASET_INSIGHT_PROMPT = """
+{description}
 
 # Instruction
 Propose insights to help improve the performance of the model on this dataset.
@@ -127,11 +137,15 @@ class SolutionDesigner:
 
     async def generate_solutions(self, dataset_info, dataset_name, save_analysis_pool=True):
         llm = LLM()
-        context = DATASET_INSIGHT_PROMPT.format(
-            dataset=dataset_info["description"],
-            metadata=self.metadata_builder(dataset_info["metadata"]),
-            head=dataset_info["df_head"],
-        )
+        if type(dataset_info) == dict:
+            description_prompt = DATASET_DESCRIPTION_SELA_PROMPT.format(
+                dataset=dataset_info["description"],
+                metadata=self.metadata_builder(dataset_info["metadata"]),
+                head=dataset_info["df_head"],
+            )
+        else:
+            description_prompt = DATASET_DESCRIPTION_CUSTOM_PROMPT.format(dataset_description=dataset_info)
+        context = DATASET_INSIGHT_PROMPT.format(description=description_prompt)
         rsp = await llm.aask(context)
         rsp = clean_json_from_rsp(rsp)
         analysis_pool = self.process_analysis_pool(json.loads(rsp))

From 1d4a84512039a3d2a29714f640bac08302d004cf Mon Sep 17 00:00:00 2001
From: Yizhou Chi <chiyizhou@fuzhi.ai>
Date: Mon, 14 Oct 2024 16:12:26 +0800
Subject: [PATCH 04/22] =?UTF-8?q?=E6=94=AF=E6=8C=81=E8=B7=91=E9=80=9Amle?=
 =?UTF-8?q?=20bench?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 expo/MCTS.py                  |  6 ++++--
 expo/data/custom_task.py      | 17 ++++++++++++-----
 expo/evaluation/evaluation.py | 14 +++++++++++++-
 expo/run_experiment.py        |  5 +++++
 4 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/expo/MCTS.py b/expo/MCTS.py
index a8410748e..749850dd6 100644
--- a/expo/MCTS.py
+++ b/expo/MCTS.py
@@ -8,7 +8,7 @@ import shutil
 import numpy as np
 import pandas as pd
 
-from expo.data.custom_task import get_mle_bench_requirements
+from expo.data.custom_task import get_mle_bench_requirements, get_mle_task_id
 from expo.data.dataset import generate_task_requirement, get_split_dataset_path
 from expo.evaluation.evaluation import evaluate_score
 from expo.insights.instruction_generator import InstructionGenerator
@@ -35,6 +35,8 @@ def create_initial_state(
         datasets_dir = args.custom_dataset_dir
         requirement = get_mle_bench_requirements(args.custom_dataset_dir, data_config)
         exp_pool_path = None
+        # external_eval = False # make sure external eval is false if custom dataset is used
+        task = get_mle_task_id(args.custom_dataset_dir)
     else:
         dataset_config = data_config["datasets"][task]
         datasets_dir = get_split_dataset_path(task, data_config)
@@ -120,7 +122,7 @@ class Node:
             return f"{self.parent.id}-{num_sibling}"
 
     def is_terminal(self):
-        return int(self.state["start_task_id"]) == self.max_depth + 1
+        return int(self.state["start_task_id"]) == self.max_depth + 1  # TODO: Check if this is correct or +1
 
     def is_fully_expanded(self):
         return len(self.children) > 0
diff --git a/expo/data/custom_task.py b/expo/data/custom_task.py
index 14eb6aac2..f66b4aa58 100644
--- a/expo/data/custom_task.py
+++ b/expo/data/custom_task.py
@@ -22,19 +22,26 @@ COMPETITION INSTRUCTIONS
 ## More Instructions
 - output_dir: {output_dir}
 - Besides `submission.csv`, you should also save your output in the output directory.
-- Save the prediction results of BOTH the dev set and test set in `dev_predictions.csv` and `test_predictions.csv` respectively in the output directory.
-
-Do not make visualizations.
+- You should split the training data into train and dev set.
+- Save the prediction results of BOTH the dev set and test set in `dev_predictions.csv` and `test_predictions.csv` respectively in the output directory. They should be in the same format as the `submission.csv`.
+- Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. 
+**Do not make any plots or visualizations.**
 """
 
 
+def get_mle_task_id(dataset_dir):
+    return dataset_dir.split("/")[-3]
+
+
 def get_mle_bench_requirements(dataset_dir, data_config, obfuscated=False):
     work_dir = data_config["work_dir"]
-    output_dir = f"{work_dir}/output"
+    task = get_mle_task_id(dataset_dir)
+    output_dir = f"{work_dir}/{task}"
+    final_output_dir = f"{work_dir}/submission"
     os.makedirs(output_dir, exist_ok=True)
 
     if obfuscated:
-        instructions = INSTRUCTIONS_OBFUSCATED.format(dataset_dir=dataset_dir, output_dir=output_dir)
+        instructions = INSTRUCTIONS_OBFUSCATED.format(dataset_dir=dataset_dir, output_dir=final_output_dir)
         task_file = "description_obfuscated.md"
     else:
         instructions = INSTRUCTIONS.format(dataset_dir=dataset_dir, output_dir=output_dir)
diff --git a/expo/evaluation/evaluation.py b/expo/evaluation/evaluation.py
index 1ba7fa60f..2c19b81fc 100644
--- a/expo/evaluation/evaluation.py
+++ b/expo/evaluation/evaluation.py
@@ -1,3 +1,5 @@
+from pathlib import Path
+
 import numpy as np
 from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, roc_auc_score
 
@@ -33,4 +35,14 @@ def node_evaluate_score_sela(node):
 
 def node_evaluate_score_mlebench(node):
     # TODO
-    return 0
+    from mlebench.grade import grade_csv
+    from mlebench.registry import registry
+
+    competition_id = node.state["task"]
+    pred_path = node.get_predictions_path("test")
+    new_registry = registry.set_data_dir(Path(registry.get_data_dir()))
+    competition = new_registry.get_competition(competition_id)
+    submission = Path(pred_path)
+    report = grade_csv(submission, competition).to_dict()
+    report["submission_path"] = str(submission)
+    return report
diff --git a/expo/run_experiment.py b/expo/run_experiment.py
index 53fcdd18c..bf90cb07a 100644
--- a/expo/run_experiment.py
+++ b/expo/run_experiment.py
@@ -60,6 +60,11 @@ def get_di_args(parser):
 
 
 async def main(args):
+    if args.custom_dataset_dir:
+        args.external_eval = False
+        args.eval_func = "mlebench"
+        args.from_scratch = True
+
     if args.exp_mode == "mcts":
         experimenter = MCTSExperimenter(args)
     elif args.exp_mode == "greedy":

From 07800be4417c265c0b2ebe76f30bf1b2a7dd20c1 Mon Sep 17 00:00:00 2001
From: Yizhou Chi <chiyizhou@fuzhi.ai>
Date: Tue, 15 Oct 2024 14:13:25 +0800
Subject: [PATCH 05/22] allow datasets to be prepared by users

---
 expo/data/dataset.py | 31 ++++++++++++++++++++++---------
 expo/data/hf_data.py | 12 +++++++++---
 2 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/expo/data/dataset.py b/expo/data/dataset.py
index 8b0c5b980..91490dcd7 100644
--- a/expo/data/dataset.py
+++ b/expo/data/dataset.py
@@ -1,3 +1,4 @@
+import argparse
 import asyncio
 import json
 import os
@@ -18,22 +19,22 @@ Report {metric} on the eval data. Do not plot or make any visualizations.
 """
 
 USE_AG = """
-7. Please use autogluon for model training with presets='medium_quality', time_limit=None, give dev dataset to tuning_data, and use right eval_metric.
+- Please use autogluon for model training with presets='medium_quality', time_limit=None, give dev dataset to tuning_data, and use right eval_metric.
 """
 
 TEXT_MODALITY = """
-7. You could use models from transformers library for this text dataset.
-8. Use gpu if available for faster training.
+- You could use models from transformers library for this text dataset.
+- Use gpu if available for faster training.
 """
 
 IMAGE_MODALITY = """
-7. You could use models from transformers/torchvision library for this image dataset.
-8. Use gpu if available for faster training.
+- You could use models from transformers/torchvision library for this image dataset.
+- Use gpu if available for faster training.
 """
 
 STACKING = """
-7. To avoid overfitting, train a weighted ensemble model such as StackingClassifier or StackingRegressor.
-8. You could do some quick model prototyping to see which models work best and then use them in the ensemble. 
+- To avoid overfitting, train a weighted ensemble model such as StackingClassifier or StackingRegressor.
+- You could do some quick model prototyping to see which models work best and then use them in the ensemble. 
 """
 
 
@@ -361,10 +362,22 @@ async def process_dataset(dataset, solution_designer: SolutionDesigner, save_ana
     datasets_dict["datasets"][dataset.name] = dataset_dict
 
 
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--force_update", action="store_true", help="Force update datasets")
+    parser.add_argument("--save_analysis_pool", action="store_true", help="Save analysis pool")
+    parser.add_argument(
+        "--no_save_analysis_pool", dest="save_analysis_pool", action="store_false", help="Do not save analysis pool"
+    )
+    parser.set_defaults(save_analysis_pool=True)
+    return parser.parse_args()
+
+
 if __name__ == "__main__":
     datasets_dir = DATA_CONFIG["datasets_dir"]
-    force_update = False
-    save_analysis_pool = True
+    args = parse_args()
+    force_update = args.force_update
+    save_analysis_pool = args.save_analysis_pool
     datasets_dict = {"datasets": {}}
     solution_designer = SolutionDesigner()
     for dataset_id in OPENML_DATASET_IDS:
diff --git a/expo/data/hf_data.py b/expo/data/hf_data.py
index 133fbdfa6..a18517d49 100644
--- a/expo/data/hf_data.py
+++ b/expo/data/hf_data.py
@@ -7,7 +7,12 @@ import pandas as pd
 from datasets import load_dataset
 from PIL import Image
 
-from expo.data.dataset import ExpDataset, process_dataset, save_datasets_dict_to_yaml
+from expo.data.dataset import (
+    ExpDataset,
+    parse_args,
+    process_dataset,
+    save_datasets_dict_to_yaml,
+)
 from expo.insights.solution_designer import SolutionDesigner
 from expo.utils import DATA_CONFIG
 
@@ -116,8 +121,9 @@ class HFExpDataset(ExpDataset):
 
 if __name__ == "__main__":
     dataset_dir = DATA_CONFIG["datasets_dir"]
-    save_analysis_pool = True
-    force_update = False
+    args = parse_args()
+    force_update = args.force_update
+    save_analysis_pool = args.save_analysis_pool
     datasets_dict = {"datasets": {}}
     solution_designer = SolutionDesigner()
     for dataset_meta in HFDATSETS:

From d1799829493b66351054a5dcd0051507b50c394c Mon Sep 17 00:00:00 2001
From: Yizhou Chi <chiyizhou@fuzhi.ai>
Date: Tue, 15 Oct 2024 14:14:29 +0800
Subject: [PATCH 06/22] allow special-instruction for mle-bench

---
 expo/MCTS.py             |  6 ++--
 expo/README.md           | 60 +++++++++++++++++++++++-----------------
 expo/data/custom_task.py |  7 +++--
 expo/utils.py            |  2 +-
 4 files changed, 45 insertions(+), 30 deletions(-)

diff --git a/expo/MCTS.py b/expo/MCTS.py
index 749850dd6..378474b4e 100644
--- a/expo/MCTS.py
+++ b/expo/MCTS.py
@@ -33,7 +33,9 @@ def create_initial_state(
     if args.custom_dataset_dir:
         dataset_config = None
         datasets_dir = args.custom_dataset_dir
-        requirement = get_mle_bench_requirements(args.custom_dataset_dir, data_config)
+        requirement = get_mle_bench_requirements(
+            args.custom_dataset_dir, data_config, special_instruction=special_instruction
+        )
         exp_pool_path = None
         # external_eval = False # make sure external eval is false if custom dataset is used
         task = get_mle_task_id(args.custom_dataset_dir)
@@ -309,7 +311,7 @@ class MCTS:
             node = random.choice(node.children)
         reward, result_dict = await node.run_node(role)
         mcts_logger.log("MCTS", f"Simulated node's reward: {reward}")
-
+        # TODO: add new insights
         return reward
 
     def backpropagate(self, node: Node, reward):
diff --git a/expo/README.md b/expo/README.md
index 598de039d..5b913e415 100644
--- a/expo/README.md
+++ b/expo/README.md
@@ -6,7 +6,12 @@ # SELA: Tree-Search Enhanced LLM Agents for Automated Machine Learning
 ## 1. Data Preparation
 
 - Download Datasets：https://deepwisdom.feishu.cn/drive/folder/RVyofv9cvlvtxKdddt2cyn3BnTc?from=from_copylink
-
+- Download and prepare datasets from scratch:
+  ```
+  cd expo/data
+  python dataset.py --save_analysis_pool
+  python hf_data.py --save_analysis_pool
+  ```
 
 ## 2. Configs
 
@@ -85,6 +90,23 @@ ## 4. Evaluation
 
 - Use the function `evaluate_score` to evaluate.
 
+#### MLE-Bench
+**Note: mle-bench requires python 3.11 or higher**
+```
+git clone https://github.com/openai/mle-bench.git
+cd mle-bench
+pip install -e .
+```
+
+```
+mlebench prepare -c <competition-id> --data-dir <dataset-dir-save-path>
+```
+
+Enter the following command to run the experiment:
+```
+python run_experiment.py --exp_mode mcts --custom_dataset_dir <dataset-dir-save-path/prepared/public> --rollouts 10 --from_scratch
+```
+
 
 ## 5. Baselines
 ### DS Agent
@@ -92,7 +114,7 @@ ### DS Agent
 git clone https://github.com/guosyjlu/DS-Agent.git
 ```
 
-将其deployment/generate.py line46-48行部分修改如下（目的是用deepseek而非GPT的API）：
+Modify the following lines in deployment/generate.py (lines 46-48) as shown below (the purpose is to use deepseek instead of OpenAI's API):
 ```python
 messages = [{"role": "user", "content": prompt}]
 
@@ -120,7 +142,7 @@ ### DS Agent
 completion = raw_completion.split("```python")[1].split("```")[0]
 ```
 
-修改完后在新建一个`deployment/test.sh` 分别运行下列两行，`$TASK` 是你要测试的task name
+After making the changes, create a new `deployment/test.sh` and run the following two lines separately, where `$TASK` is the name of the task you want to test
 ```
 python -u generate.py --llm deepseek-coder --task $TASK --shot 1 --retrieval > "$TASK".txt 2>&1 
 
@@ -135,7 +157,7 @@ #### Setup
 git clone https://github.com/WecoAI/aideml.git
 ```
 
-修改 `aideml/aide/utils/config.yaml` 内容如下
+Modify `aideml/aide/utils/config.yaml`:
 
 ```yaml
 # path to the task data directory
@@ -192,14 +214,14 @@   # hyperparameters for the tree search
     num_drafts: 5
 ```
 
-由于 deepseek 完全兼容 OpenAI 的 API，修改`base_url`为`自己的url`，`api_key`为`自己的key`即可
+Since Deepseek is compatible to OpenAI's API, change `base_url` into `your own url`，`api_key` into `your api key`
 
 ```
-export OPENAI_API_KEY="自己的key"
-export OPENAI_BASE_URL="自己的url"
+export OPENAI_API_KEY="your api key"
+export OPENAI_BASE_URL="your own url"
 ```
 
-修改`aideml/aide/backend/__init__.py` 30 行内容如下：
+Modify `aideml/aide/backend/__init__.py`'s line 30 and below:
 
 ```python
 model_kwargs = model_kwargs | {
@@ -213,7 +235,7 @@   # hyperparameters for the tree search
         query_func = backend_openai.query
 ```
 
-由于 deepseekV2.5 不再支持 system message 使用 function call，修改 `aideml/aide/agent.py` 312 行内容如下：
+Since deepseekV2.5 no longer supports system message using function call, modify `aideml/aide/agent.py`'s line 312:
 
 ```python
 response = cast(
@@ -228,7 +250,7 @@   # hyperparameters for the tree search
         )
 ```
 
-修改完后
+Modify and install:
 
 ```
 cd aideml
@@ -237,8 +259,8 @@   # hyperparameters for the tree search
 
 #### Run
 
-运行下面脚本获取运行结果，在当前目录下将生成一个 log 文件夹以及 workspace 文件夹
-log 文件夹中将包含实验使用配置以及生成方案记录，workspace 文件夹下将保存 aide 最后生成的结果文件
+Run the following script to get the running results, a `log` folder and a `workspace` folder will be generated in the current directory
+The `log` folder will contain the experimental configuration and the generated scheme, and the `workspace` folder will save the final results generated by aide
 
 ```
 python experimenter/aide.py
@@ -264,7 +286,6 @@ #### Setup
 Replace {task_name} with the specific task you want to run.
 
 
-提供github链接，并说明使用的命令以及参数设置
 ### AutoSklearn
 #### System requirements
 auto-sklearn has the following system requirements:
@@ -295,15 +316,4 @@ ### Base DI
 For setup, check 4.
 - `python run_experiment.py --exp_mode base --task titanic --num_experiments 10`
 - Specifically instruct DI to use AutoGluon: `--special_instruction ag`
-- Specifically instruct DI to use the stacking ensemble method: `--special_instruction stacking`
-
-
-
-
-
-
-
-
-
-
-
+- Specifically instruct DI to use the stacking ensemble method: `--special_instruction stacking`
\ No newline at end of file
diff --git a/expo/data/custom_task.py b/expo/data/custom_task.py
index f66b4aa58..e904e9496 100644
--- a/expo/data/custom_task.py
+++ b/expo/data/custom_task.py
@@ -1,5 +1,6 @@
 import os
 
+from expo.data.dataset import SPECIAL_INSTRUCTIONS
 from expo.experimenter.mle_bench.instructions import (
     ADDITIONAL_NOTES,
     INSTRUCTIONS,
@@ -24,7 +25,7 @@ COMPETITION INSTRUCTIONS
 - Besides `submission.csv`, you should also save your output in the output directory.
 - You should split the training data into train and dev set.
 - Save the prediction results of BOTH the dev set and test set in `dev_predictions.csv` and `test_predictions.csv` respectively in the output directory. They should be in the same format as the `submission.csv`.
-- Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. 
+- Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. {special_instruction}
 **Do not make any plots or visualizations.**
 """
 
@@ -33,12 +34,13 @@ def get_mle_task_id(dataset_dir):
     return dataset_dir.split("/")[-3]
 
 
-def get_mle_bench_requirements(dataset_dir, data_config, obfuscated=False):
+def get_mle_bench_requirements(dataset_dir, data_config, obfuscated=False, special_instruction=""):
     work_dir = data_config["work_dir"]
     task = get_mle_task_id(dataset_dir)
     output_dir = f"{work_dir}/{task}"
     final_output_dir = f"{work_dir}/submission"
     os.makedirs(output_dir, exist_ok=True)
+    special_instruction = SPECIAL_INSTRUCTIONS[special_instruction]
 
     if obfuscated:
         instructions = INSTRUCTIONS_OBFUSCATED.format(dataset_dir=dataset_dir, output_dir=final_output_dir)
@@ -54,6 +56,7 @@ def get_mle_bench_requirements(dataset_dir, data_config, obfuscated=False):
         additonal_notes=ADDITIONAL_NOTES,
         task_description=task_description,
         output_dir=output_dir,
+        special_instruction=special_instruction,
     )
     print(mle_requirement)
     return mle_requirement
diff --git a/expo/utils.py b/expo/utils.py
index f3381c91c..21b311e7f 100644
--- a/expo/utils.py
+++ b/expo/utils.py
@@ -111,7 +111,7 @@ async def load_execute_notebook(role):
     codes = [task.code for task in tasks if task.code]
     executor = role.execute_code
     executor.nb = nbformat.v4.new_notebook()
-    executor.nb_client = NotebookClient(executor.nb, timeout=executor.timeout)
+    executor.nb_client = NotebookClient(executor.nb, timeout=role.role_timeout)
     # await executor.build()
     for code in codes:
         outputs, success = await executor.run(code)

From 0166834ce47396efb8827859eb2e788ab0dd5f6f Mon Sep 17 00:00:00 2001
From: Yizhou Chi <chiyizhou@fuzhi.ai>
Date: Tue, 15 Oct 2024 16:19:02 +0800
Subject: [PATCH 07/22] fix special instruction bug

---
 expo/data/custom_task.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/expo/data/custom_task.py b/expo/data/custom_task.py
index e904e9496..fe366d7ea 100644
--- a/expo/data/custom_task.py
+++ b/expo/data/custom_task.py
@@ -34,14 +34,16 @@ def get_mle_task_id(dataset_dir):
     return dataset_dir.split("/")[-3]
 
 
-def get_mle_bench_requirements(dataset_dir, data_config, obfuscated=False, special_instruction=""):
+def get_mle_bench_requirements(dataset_dir, data_config, special_instruction, obfuscated=False):
     work_dir = data_config["work_dir"]
     task = get_mle_task_id(dataset_dir)
     output_dir = f"{work_dir}/{task}"
     final_output_dir = f"{work_dir}/submission"
     os.makedirs(output_dir, exist_ok=True)
-    special_instruction = SPECIAL_INSTRUCTIONS[special_instruction]
-
+    if special_instruction:
+        special_instruction = SPECIAL_INSTRUCTIONS[special_instruction]
+    else:
+        special_instruction = ""
     if obfuscated:
         instructions = INSTRUCTIONS_OBFUSCATED.format(dataset_dir=dataset_dir, output_dir=final_output_dir)
         task_file = "description_obfuscated.md"

From 02b4f0aa13a238a8a053f4354048d76dbe99516b Mon Sep 17 00:00:00 2001
From: Yizhou Chi <chiyizhou@fuzhi.ai>
Date: Tue, 15 Oct 2024 16:41:28 +0800
Subject: [PATCH 08/22] add timout to mlebench readme instruction

---
 expo/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/expo/README.md b/expo/README.md
index 5b913e415..6704582b8 100644
--- a/expo/README.md
+++ b/expo/README.md
@@ -104,7 +104,7 @@ #### MLE-Bench
 
 Enter the following command to run the experiment:
 ```
-python run_experiment.py --exp_mode mcts --custom_dataset_dir <dataset-dir-save-path/prepared/public> --rollouts 10 --from_scratch
+python run_experiment.py --exp_mode mcts --custom_dataset_dir <dataset-dir-save-path/prepared/public> --rollouts 10 --from_scratch --role_timeout 3600
 ```
 
 

From 541f8a1b100bce3c88e9a346e090ce86ab01f91f Mon Sep 17 00:00:00 2001
From: Yizhou Chi <chiyizhou@fuzhi.ai>
Date: Tue, 15 Oct 2024 19:18:30 +0800
Subject: [PATCH 09/22] fix path bug

---
 expo/evaluation/evaluation.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/expo/evaluation/evaluation.py b/expo/evaluation/evaluation.py
index 2c19b81fc..1e58e1725 100644
--- a/expo/evaluation/evaluation.py
+++ b/expo/evaluation/evaluation.py
@@ -39,8 +39,9 @@ def node_evaluate_score_mlebench(node):
     from mlebench.registry import registry
 
     competition_id = node.state["task"]
+    data_dir = Path(node.state["custom_dataset_dir"]).parent.parent.parent  # prepared/public/../../../
     pred_path = node.get_predictions_path("test")
-    new_registry = registry.set_data_dir(Path(registry.get_data_dir()))
+    new_registry = registry.set_data_dir(data_dir)
     competition = new_registry.get_competition(competition_id)
     submission = Path(pred_path)
     report = grade_csv(submission, competition).to_dict()

From 7794b99005ba84a8819c1564a7ad8e3d1e27c5b0 Mon Sep 17 00:00:00 2001
From: Yizhou Chi <chiyizhou@fuzhi.ai>
Date: Wed, 16 Oct 2024 09:50:24 +0800
Subject: [PATCH 10/22] fix: role timeout not passing in

---
 expo/MCTS.py           | 6 +++++-
 expo/run_experiment.py | 7 +++++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/expo/MCTS.py b/expo/MCTS.py
index 378474b4e..cfb21a61c 100644
--- a/expo/MCTS.py
+++ b/expo/MCTS.py
@@ -20,7 +20,11 @@ from metagpt.utils.common import read_json_file
 
 def initialize_di_root_node(state, reflection: bool = True):
     role = ResearchAssistant(
-        node_id="0", start_task_id=state["start_task_id"], use_reflection=reflection, role_dir=state["node_dir"]
+        node_id="0",
+        start_task_id=state["start_task_id"],
+        use_reflection=reflection,
+        role_dir=state["node_dir"],
+        role_timeout=state["role_timeout"],
     )
     return role, Node(parent=None, state=state, action=None, value=0)
 
diff --git a/expo/run_experiment.py b/expo/run_experiment.py
index bf90cb07a..71529b955 100644
--- a/expo/run_experiment.py
+++ b/expo/run_experiment.py
@@ -9,7 +9,7 @@ from expo.experimenter.experimenter import Experimenter
 from expo.experimenter.mcts import MCTSExperimenter
 
 
-def get_args():
+def get_args(cmd=True):
     parser = argparse.ArgumentParser()
     parser.add_argument("--name", type=str, default="")
     parser.add_argument(
@@ -22,7 +22,10 @@ def get_args():
     get_di_args(parser)
     get_mcts_args(parser)
     get_aug_exp_args(parser)
-    return parser.parse_args()
+    if cmd:
+        return parser.parse_args()
+    else:
+        return parser.parse_args("")
 
 
 def get_mcts_args(parser):

From 989a3b4299c4f2520b7fc6893529398b4826b98f Mon Sep 17 00:00:00 2001
From: Yizhou Chi <chiyizhou@fuzhi.ai>
Date: Wed, 16 Oct 2024 10:53:37 +0800
Subject: [PATCH 11/22] allow max depth passing in

---
 expo/MCTS.py              | 2 +-
 expo/experimenter/mcts.py | 2 +-
 expo/run_experiment.py    | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/expo/MCTS.py b/expo/MCTS.py
index cfb21a61c..1eb8a131c 100644
--- a/expo/MCTS.py
+++ b/expo/MCTS.py
@@ -274,7 +274,7 @@ class MCTS:
     # data_path
     root_node: Node = None
     children: dict = {}
-    max_depth: int = 5
+    max_depth: int = None
     c_explore: float = 1.4
     c_unvisited: float = 0.8
     node_order: list = []
diff --git a/expo/experimenter/mcts.py b/expo/experimenter/mcts.py
index 37fc7a071..a42566366 100644
--- a/expo/experimenter/mcts.py
+++ b/expo/experimenter/mcts.py
@@ -29,7 +29,7 @@ class MCTSExperimenter(Experimenter):
 
     async def run_experiment(self):
         use_fixed_insights = self.args.use_fixed_insights
-        depth = 5
+        depth = self.args.max_depth
         if self.tree_mode == "greedy":
             mcts = Greedy(root_node=None, max_depth=depth, use_fixed_insights=use_fixed_insights)
         elif self.tree_mode == "random":
diff --git a/expo/run_experiment.py b/expo/run_experiment.py
index 71529b955..4e6b41fd7 100644
--- a/expo/run_experiment.py
+++ b/expo/run_experiment.py
@@ -44,6 +44,7 @@ def get_mcts_args(parser):
     parser.set_defaults(external_eval=True)
     parser.add_argument("--eval_func", type=str, default="sela", choices=["sela", "mlebench"])
     parser.add_argument("--custom_dataset_dir", type=str, default=None)
+    parser.add_argument("--max_depth", type=int, default=4)
 
 
 def get_aug_exp_args(parser):

From df7a04dd1993f3fa7266afb2719d2995b080ef51 Mon Sep 17 00:00:00 2001
From: Yizhou Chi <chiyizhou@fuzhi.ai>
Date: Wed, 16 Oct 2024 14:28:27 +0800
Subject: [PATCH 12/22] output dev set score

---
 expo/data/custom_task.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/expo/data/custom_task.py b/expo/data/custom_task.py
index fe366d7ea..c4dd0012c 100644
--- a/expo/data/custom_task.py
+++ b/expo/data/custom_task.py
@@ -24,6 +24,7 @@ COMPETITION INSTRUCTIONS
 - output_dir: {output_dir}
 - Besides `submission.csv`, you should also save your output in the output directory.
 - You should split the training data into train and dev set.
+- You should use the dev set to improve your model. Print the final dev set score after training.
 - Save the prediction results of BOTH the dev set and test set in `dev_predictions.csv` and `test_predictions.csv` respectively in the output directory. They should be in the same format as the `submission.csv`.
 - Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. {special_instruction}
 **Do not make any plots or visualizations.**

From 38daf24c33f9e89dea3fa53e772924cd0daae16c Mon Sep 17 00:00:00 2001
From: Yizhou Chi <chiyizhou@fuzhi.ai>
Date: Thu, 17 Oct 2024 09:55:37 +0800
Subject: [PATCH 13/22] rename task if custom_data_dir is used

---
 expo/run_experiment.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/expo/run_experiment.py b/expo/run_experiment.py
index 4e6b41fd7..c977b4dc9 100644
--- a/expo/run_experiment.py
+++ b/expo/run_experiment.py
@@ -1,6 +1,7 @@
 import argparse
 import asyncio
 
+from expo.data.custom_task import get_mle_task_id
 from expo.experimenter.aug import AugExperimenter
 from expo.experimenter.autogluon import GluonExperimenter
 from expo.experimenter.autosklearn import AutoSklearnExperimenter
@@ -68,6 +69,7 @@ async def main(args):
         args.external_eval = False
         args.eval_func = "mlebench"
         args.from_scratch = True
+        args.task = get_mle_task_id(args.custom_dataset_dir)
 
     if args.exp_mode == "mcts":
         experimenter = MCTSExperimenter(args)

From a46f5753612c9e5e5e6a948f309873f994b16174 Mon Sep 17 00:00:00 2001
From: Yizhou Chi <chiyizhou@fuzhi.ai>
Date: Thu, 17 Oct 2024 10:11:31 +0800
Subject: [PATCH 14/22] clean up input argument

---
 expo/MCTS.py                      | 14 +++++++-------
 expo/experimenter/custom.py       |  4 +---
 expo/experimenter/experimenter.py |  3 ---
 expo/run_experiment.py            | 17 +++++++++--------
 4 files changed, 17 insertions(+), 21 deletions(-)

diff --git a/expo/MCTS.py b/expo/MCTS.py
index 1eb8a131c..8778554ed 100644
--- a/expo/MCTS.py
+++ b/expo/MCTS.py
@@ -29,16 +29,14 @@ def initialize_di_root_node(state, reflection: bool = True):
     return role, Node(parent=None, state=state, action=None, value=0)
 
 
-def create_initial_state(
-    task, start_task_id, data_config, low_is_better: bool, name: str, special_instruction: str, args
-):
+def create_initial_state(task, start_task_id, data_config, args):
     external_eval = args.external_eval
 
     if args.custom_dataset_dir:
         dataset_config = None
         datasets_dir = args.custom_dataset_dir
         requirement = get_mle_bench_requirements(
-            args.custom_dataset_dir, data_config, special_instruction=special_instruction
+            args.custom_dataset_dir, data_config, special_instruction=args.special_instruction
         )
         exp_pool_path = None
         # external_eval = False # make sure external eval is false if custom dataset is used
@@ -46,20 +44,22 @@ def create_initial_state(
     else:
         dataset_config = data_config["datasets"][task]
         datasets_dir = get_split_dataset_path(task, data_config)
-        requirement = generate_task_requirement(task, data_config, is_di=True, special_instruction=special_instruction)
+        requirement = generate_task_requirement(
+            task, data_config, is_di=True, special_instruction=args.special_instruction
+        )
         exp_pool_path = get_exp_pool_path(task, data_config, pool_name="ds_analysis_pool")
 
     initial_state = {
         "task": task,
         "work_dir": data_config["work_dir"],
-        "node_dir": os.path.join(data_config["work_dir"], data_config["role_dir"], f"{task}{name}"),
+        "node_dir": os.path.join(data_config["work_dir"], data_config["role_dir"], f"{task}{args.name}"),
         "dataset_config": dataset_config,
         "datasets_dir": datasets_dir,  # won't be used if external eval is used
         "exp_pool_path": exp_pool_path,
         "requirement": requirement,
         "has_run": False,
         "start_task_id": start_task_id,
-        "low_is_better": low_is_better,
+        "low_is_better": args.low_is_better,
         "role_timeout": args.role_timeout,
         "external_eval": external_eval,
         "custom_dataset_dir": args.custom_dataset_dir,
diff --git a/expo/experimenter/custom.py b/expo/experimenter/custom.py
index 92b7dafa2..f245499ca 100644
--- a/expo/experimenter/custom.py
+++ b/expo/experimenter/custom.py
@@ -21,9 +21,7 @@ class CustomExperimenter(Experimenter):
             self.task,
             start_task_id=1,
             data_config=self.data_config,
-            low_is_better=self.low_is_better,
-            name=self.name,
-            special_instruction=self.args.special_instruction,
+            args=self.args,
         )
 
     def run_experiment(self):
diff --git a/expo/experimenter/experimenter.py b/expo/experimenter/experimenter.py
index 417adabad..4a0b8413e 100644
--- a/expo/experimenter/experimenter.py
+++ b/expo/experimenter/experimenter.py
@@ -24,9 +24,6 @@ class Experimenter:
             self.args.task,
             start_task_id=self.start_task_id,
             data_config=self.data_config,
-            low_is_better=self.args.low_is_better,
-            name=self.args.name,
-            special_instruction=self.args.special_instruction,
             args=self.args,
         )
 
diff --git a/expo/run_experiment.py b/expo/run_experiment.py
index c977b4dc9..be891814d 100644
--- a/expo/run_experiment.py
+++ b/expo/run_experiment.py
@@ -24,9 +24,16 @@ def get_args(cmd=True):
     get_mcts_args(parser)
     get_aug_exp_args(parser)
     if cmd:
-        return parser.parse_args()
+        args = parser.parse_args()
     else:
-        return parser.parse_args("")
+        args = parser.parse_args("")
+
+    if args.custom_dataset_dir:
+        args.external_eval = False
+        args.eval_func = "mlebench"
+        args.from_scratch = True
+        args.task = get_mle_task_id(args.custom_dataset_dir)
+    return args
 
 
 def get_mcts_args(parser):
@@ -65,12 +72,6 @@ def get_di_args(parser):
 
 
 async def main(args):
-    if args.custom_dataset_dir:
-        args.external_eval = False
-        args.eval_func = "mlebench"
-        args.from_scratch = True
-        args.task = get_mle_task_id(args.custom_dataset_dir)
-
     if args.exp_mode == "mcts":
         experimenter = MCTSExperimenter(args)
     elif args.exp_mode == "greedy":

From 0f01c07b836fd756ce66e6a882fab8c0a1a27fff Mon Sep 17 00:00:00 2001
From: Yizhou Chi <chiyizhou@fuzhi.ai>
Date: Thu, 17 Oct 2024 10:27:33 +0800
Subject: [PATCH 15/22] add tree visualization script and function

---
 expo/evaluation/visualize_mcts.py    | 108 ++++++++++++++++++++++++++-
 expo/scripts/visualize_experiment.py |  23 ++++++
 2 files changed, 128 insertions(+), 3 deletions(-)
 create mode 100644 expo/scripts/visualize_experiment.py

diff --git a/expo/evaluation/visualize_mcts.py b/expo/evaluation/visualize_mcts.py
index d310036c0..e429789fd 100644
--- a/expo/evaluation/visualize_mcts.py
+++ b/expo/evaluation/visualize_mcts.py
@@ -1,5 +1,8 @@
 import textwrap
 
+import matplotlib.pyplot as plt
+import networkx as nx
+
 from expo.MCTS import Node
 
 NODE_TEMPLATE = """\
@@ -11,6 +14,9 @@ Score: {score}, Visits: {num_visits}
 
 """
 
+NODE_SIZE = 12000
+NODE_FONT_SIZE = 18
+
 
 def get_role_plans(role):
     plans = role.planner.plan.tasks
@@ -42,7 +48,7 @@ def get_tree_text(node: Node):
             id=node_id, plans=instruct_plans_text, simulated=simulated, score=score, num_visits=num_visits
         )
 
-    def visualize_tree(node, depth=0, previous_plans=None):
+    def visualize_tree_text(node, depth=0, previous_plans=None):
         text = ""
         if node is not None:
             text += visualize_node(node, previous_plans)
@@ -50,10 +56,106 @@ def get_tree_text(node: Node):
             code_set.update({task.instruction for task in role.planner.plan.tasks})
             previous_plans = get_role_plans(role)
             for child in node.children:
-                text += textwrap.indent(visualize_tree(child, depth + 1, previous_plans), "\t")
+                text += textwrap.indent(visualize_tree_text(child, depth + 1, previous_plans), "\t")
         return text
 
     num_simulations = node.visited
     text = f"Number of simulations: {num_simulations}\n"
-    text += visualize_tree(node)
+    text += visualize_tree_text(node)
     return text, len(code_set)
+
+
+def get_node_color(node):
+    if node["visits"] == 0:
+        return "#D3D3D3"
+    else:
+        # The higher the avg_value, the more intense the color
+        # avg_value is between 0 and 1
+        avg_value = node["avg_value"]
+        # Convert avg_value to a color ranging from red (low) to green (high)
+        red = int(255 * (1 - avg_value))
+        green = int(255 * avg_value)
+        return f"#{red:02X}{green:02X}00"
+
+
+def visualize_tree(graph, save_path=""):
+    # Use a hierarchical layout for tree-like visualization
+    pos = nx.spring_layout(graph, k=0.9, iterations=50)
+
+    plt.figure(figsize=(30, 20))  # Further increase figure size for better visibility
+
+    # Calculate node levels
+    root = "0"
+    levels = nx.single_source_shortest_path_length(graph, root)
+    max_level = max(levels.values())
+
+    # Adjust y-coordinates based on levels and x-coordinates to prevent overlap
+    nodes_by_level = {}
+    for node, level in levels.items():
+        if level not in nodes_by_level:
+            nodes_by_level[level] = []
+        nodes_by_level[level].append(node)
+
+    for level, nodes in nodes_by_level.items():
+        y = 1 - level / max_level
+        x_step = 1.0 / (len(nodes) + 1)
+        for i, node in enumerate(sorted(nodes)):
+            pos[node] = ((i + 1) * x_step, y)
+
+    # Draw edges
+    nx.draw_networkx_edges(graph, pos, edge_color="gray", arrows=True, arrowsize=40, width=3)
+
+    # Draw nodes
+    node_colors = [get_node_color(graph.nodes[node]) for node in graph.nodes]
+    nx.draw_networkx_nodes(graph, pos, node_size=NODE_SIZE, node_color=node_colors)
+
+    # Add labels to nodes
+    labels = nx.get_node_attributes(graph, "label")
+    nx.draw_networkx_labels(graph, pos, labels, font_size=NODE_FONT_SIZE)
+
+    # Add instructions to the right side of nodes
+    instructions = nx.get_node_attributes(graph, "instruction")
+    for node, (x, y) in pos.items():
+        wrapped_text = textwrap.fill(instructions[node], width=30)  # Adjust width as needed
+        plt.text(x + 0.05, y, wrapped_text, fontsize=15, ha="left", va="center")
+
+    plt.title("MCTS Tree Visualization", fontsize=40)
+    plt.axis("off")  # Turn off axis
+    plt.tight_layout()
+    if save_path:
+        plt.savefig(save_path)
+    plt.show()
+
+
+def build_tree_recursive(graph, parent_id, node, start_task_id=2):
+    """
+    Recursively builds the entire tree starting from the root node.
+    Adds nodes and edges to the NetworkX graph.
+    """
+    role = node.load_role()
+    depth = node.get_depth()
+    if depth == 0:
+        instruction = "\n\n".join([role.planner.plan.tasks[i].instruction for i in range(start_task_id)])
+    else:
+        instruction = role.planner.plan.tasks[depth + start_task_id - 1].instruction
+    print(instruction)
+    # Add the current node with attributes to the graph
+    dev_score = node.raw_reward.get("dev_score", 0) * 100
+    avg_score = node.avg_value() * 100
+    graph.add_node(
+        parent_id,
+        label=f"{node.id}\nAvg: {avg_score:.1f}\nScore: {dev_score:.1f}\nVisits: {node.visited}",
+        avg_value=node.avg_value(),
+        dev_score=dev_score,
+        visits=node.visited,
+        instruction=instruction,
+    )
+    # Stopping condition: if the node has no children, return
+    if not node.children:
+        return
+
+    # Recursively create all child nodes
+    for i, child in enumerate(node.children):
+        child_id = f"{parent_id}-{i}"
+        graph.add_edge(parent_id, child_id)
+        build_tree_recursive(graph, child_id, child)
diff --git a/expo/scripts/visualize_experiment.py b/expo/scripts/visualize_experiment.py
new file mode 100644
index 000000000..c06b1eeab
--- /dev/null
+++ b/expo/scripts/visualize_experiment.py
@@ -0,0 +1,23 @@
+import networkx as nx
+
+from expo.evaluation.visualize_mcts import build_tree_recursive, visualize_tree
+from expo.MCTS import MCTS, create_initial_state, initialize_di_root_node
+from expo.run_experiment import get_args
+from expo.utils import DATA_CONFIG
+
+if __name__ == "__main__":
+    args = get_args()
+    data_config = DATA_CONFIG
+    state = create_initial_state(args.task, 0, data_config, args=args)
+    role, node = initialize_di_root_node(state)
+    mcts = MCTS(
+        root_node=node,
+        max_depth=5,
+        use_fixed_insights=False,
+    )
+
+    mcts.load_tree()
+    root = mcts.root_node
+    G = nx.DiGraph()
+    tree = build_tree_recursive(G, "0", root)
+    visualize_tree(tree, save_path="../results/tree.png")

From 1d22466ac53f848dfed275c6dfb08425efd130b3 Mon Sep 17 00:00:00 2001
From: Yizhou Chi <chiyizhou@fuzhi.ai>
Date: Thu, 17 Oct 2024 10:29:46 +0800
Subject: [PATCH 16/22] change dir for tree fig

---
 expo/scripts/visualize_experiment.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/expo/scripts/visualize_experiment.py b/expo/scripts/visualize_experiment.py
index c06b1eeab..940d1f11b 100644
--- a/expo/scripts/visualize_experiment.py
+++ b/expo/scripts/visualize_experiment.py
@@ -20,4 +20,4 @@ if __name__ == "__main__":
     root = mcts.root_node
     G = nx.DiGraph()
     tree = build_tree_recursive(G, "0", root)
-    visualize_tree(tree, save_path="../results/tree.png")
+    visualize_tree(tree, save_path="results/tree.png")

From 6646983a255dc81057be91faa47d7bb6d98bae93 Mon Sep 17 00:00:00 2001
From: Yizhou Chi <chiyizhou@fuzhi.ai>
Date: Thu, 17 Oct 2024 10:34:05 +0800
Subject: [PATCH 17/22] fix visualization bug

---
 expo/scripts/visualize_experiment.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/expo/scripts/visualize_experiment.py b/expo/scripts/visualize_experiment.py
index 940d1f11b..42b4490ec 100644
--- a/expo/scripts/visualize_experiment.py
+++ b/expo/scripts/visualize_experiment.py
@@ -19,5 +19,5 @@ if __name__ == "__main__":
     mcts.load_tree()
     root = mcts.root_node
     G = nx.DiGraph()
-    tree = build_tree_recursive(G, "0", root)
-    visualize_tree(tree, save_path="results/tree.png")
+    build_tree_recursive(G, "0", root)
+    visualize_tree(G, save_path="results/tree.png")

From 510136ab17e32e05d5a443fe832d57a7ba154605 Mon Sep 17 00:00:00 2001
From: Yizhou Chi <chiyizhou@fuzhi.ai>
Date: Thu, 17 Oct 2024 10:37:16 +0800
Subject: [PATCH 18/22] allowing whether to show instructions

---
 expo/evaluation/visualize_mcts.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/expo/evaluation/visualize_mcts.py b/expo/evaluation/visualize_mcts.py
index e429789fd..6a8869670 100644
--- a/expo/evaluation/visualize_mcts.py
+++ b/expo/evaluation/visualize_mcts.py
@@ -78,7 +78,7 @@ def get_node_color(node):
         return f"#{red:02X}{green:02X}00"
 
 
-def visualize_tree(graph, save_path=""):
+def visualize_tree(graph, show_instructions=False, save_path=""):
     # Use a hierarchical layout for tree-like visualization
     pos = nx.spring_layout(graph, k=0.9, iterations=50)
 
@@ -113,11 +113,12 @@ def visualize_tree(graph, save_path=""):
     labels = nx.get_node_attributes(graph, "label")
     nx.draw_networkx_labels(graph, pos, labels, font_size=NODE_FONT_SIZE)
 
-    # Add instructions to the right side of nodes
-    instructions = nx.get_node_attributes(graph, "instruction")
-    for node, (x, y) in pos.items():
-        wrapped_text = textwrap.fill(instructions[node], width=30)  # Adjust width as needed
-        plt.text(x + 0.05, y, wrapped_text, fontsize=15, ha="left", va="center")
+    if show_instructions:
+        # Add instructions to the right side of nodes
+        instructions = nx.get_node_attributes(graph, "instruction")
+        for node, (x, y) in pos.items():
+            wrapped_text = textwrap.fill(instructions[node], width=30)  # Adjust width as needed
+            plt.text(x + 0.05, y, wrapped_text, fontsize=15, ha="left", va="center")
 
     plt.title("MCTS Tree Visualization", fontsize=40)
     plt.axis("off")  # Turn off axis

From 06710fbc18a9298ccf257edf7930100cce766bb9 Mon Sep 17 00:00:00 2001
From: Yizhou Chi <chiyizhou@fuzhi.ai>
Date: Thu, 17 Oct 2024 15:24:22 +0800
Subject: [PATCH 19/22] fix typo in readme.md

---
 expo/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/expo/README.md b/expo/README.md
index 6704582b8..1d0d8476d 100644
--- a/expo/README.md
+++ b/expo/README.md
@@ -65,7 +65,7 @@ #### Run
 
 If the dataset has reg metric, remember to use `--low_is_better`:
 
-- `python run_experiment.py --exp_mode mcts --task house_prices --rollouts 10 --low_is_better`
+- `python run_experiment.py --exp_mode mcts --task house-prices --rollouts 10 --low_is_better`
 
 
 In addition to the generated insights, include the fixed insights saved in `expo/insights/fixed_insights.json`

From 852fbc58ee9eaa77160dcd1bcdcd533c896ed6a9 Mon Sep 17 00:00:00 2001
From: Yizhou Chi <chiyizhou@fuzhi.ai>
Date: Thu, 17 Oct 2024 17:47:41 +0800
Subject: [PATCH 20/22] automatically update args.low_is_better for mle-bench

---
 expo/data/custom_task.py | 9 +++++++++
 expo/run_experiment.py   | 4 +++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/expo/data/custom_task.py b/expo/data/custom_task.py
index c4dd0012c..f3cd433f5 100644
--- a/expo/data/custom_task.py
+++ b/expo/data/custom_task.py
@@ -35,6 +35,15 @@ def get_mle_task_id(dataset_dir):
     return dataset_dir.split("/")[-3]
 
 
+def get_mle_is_lower_better(task):
+    from mlebench.data import get_leaderboard
+    from mlebench.registry import registry
+
+    competition = registry.get_competition(task)
+    competition_leaderboard = get_leaderboard(competition)
+    return competition.grader.is_lower_better(competition_leaderboard)
+
+
 def get_mle_bench_requirements(dataset_dir, data_config, special_instruction, obfuscated=False):
     work_dir = data_config["work_dir"]
     task = get_mle_task_id(dataset_dir)
diff --git a/expo/run_experiment.py b/expo/run_experiment.py
index be891814d..7b49e6738 100644
--- a/expo/run_experiment.py
+++ b/expo/run_experiment.py
@@ -1,7 +1,7 @@
 import argparse
 import asyncio
 
-from expo.data.custom_task import get_mle_task_id
+from expo.data.custom_task import get_mle_is_lower_better, get_mle_task_id
 from expo.experimenter.aug import AugExperimenter
 from expo.experimenter.autogluon import GluonExperimenter
 from expo.experimenter.autosklearn import AutoSklearnExperimenter
@@ -33,6 +33,8 @@ def get_args(cmd=True):
         args.eval_func = "mlebench"
         args.from_scratch = True
         args.task = get_mle_task_id(args.custom_dataset_dir)
+        args.low_is_better = get_mle_is_lower_better(args.task)
+        print("low_is_better:", args.low_is_better)
     return args
 
 

From 6f437bb76d3897a687206ce9a7e9392d149dffc7 Mon Sep 17 00:00:00 2001
From: Yizhou Chi <chiyizhou@fuzhi.ai>
Date: Thu, 17 Oct 2024 17:49:31 +0800
Subject: [PATCH 21/22] automatically change low_is_better for rmse

---
 expo/MCTS.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/expo/MCTS.py b/expo/MCTS.py
index 8778554ed..2ce559ae0 100644
--- a/expo/MCTS.py
+++ b/expo/MCTS.py
@@ -43,6 +43,8 @@ def create_initial_state(task, start_task_id, data_config, args):
         task = get_mle_task_id(args.custom_dataset_dir)
     else:
         dataset_config = data_config["datasets"][task]
+        if dataset_config["metric"] == "rmse":
+            args.low_is_better = True
         datasets_dir = get_split_dataset_path(task, data_config)
         requirement = generate_task_requirement(
             task, data_config, is_di=True, special_instruction=args.special_instruction

From de42e32b8e5a5293365852fde63ccaf0f69d4d95 Mon Sep 17 00:00:00 2001
From: Yizhou Chi <chiyizhou@fuzhi.ai>
Date: Thu, 17 Oct 2024 17:55:24 +0800
Subject: [PATCH 22/22] automatically update low_is_better for our task

---
 expo/insights/instruction_generator.py | 4 +++-
 expo/run_experiment.py                 | 1 -
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/expo/insights/instruction_generator.py b/expo/insights/instruction_generator.py
index 835c1ff9d..78b32e45d 100644
--- a/expo/insights/instruction_generator.py
+++ b/expo/insights/instruction_generator.py
@@ -41,7 +41,9 @@ class InstructionGenerator:
             with open(f"{state['custom_dataset_dir']}/description.md", "r", encoding="utf-8") as file:
                 self.dataset_info = file.read()
         else:
-            dataset_info_path = f"{self.data_config['datasets_dir']}/{state['task']}/dataset_info.json"
+            dataset_info_path = (
+                f"{self.data_config['datasets_dir']}/{state['dataset_config']['dataset']}/dataset_info.json"
+            )
             with open(dataset_info_path, "r") as file:
                 self.dataset_info = json.load(file)
         self.use_fixed_insights = use_fixed_insights
diff --git a/expo/run_experiment.py b/expo/run_experiment.py
index 7b49e6738..68c3b35d4 100644
--- a/expo/run_experiment.py
+++ b/expo/run_experiment.py
@@ -34,7 +34,6 @@ def get_args(cmd=True):
         args.from_scratch = True
         args.task = get_mle_task_id(args.custom_dataset_dir)
         args.low_is_better = get_mle_is_lower_better(args.task)
-        print("low_is_better:", args.low_is_better)
     return args