From 3a57060e25a8acfd2ed0f80b4d68a5a110425159 Mon Sep 17 00:00:00 2001 From: Yizhou Chi Date: Sat, 12 Oct 2024 17:16:51 +0800 Subject: [PATCH] 1. add eval_func for sela and compatibility with others 2. llm extract score (use all code blocks and execution results) 3. add argument for custom dataset dir 4. custom dataset requirement support --- expo/MCTS.py | 65 ++++++++++++++------- expo/data/custom_task.py | 38 ++++++++++++ expo/data/dataset.py | 2 +- expo/evaluation/evaluation.py | 12 ++++ expo/experimenter/experimenter.py | 5 +- expo/experimenter/mcts.py | 21 ++++++- expo/experimenter/mle_bench/instructions.py | 47 +++++++++++++++ expo/insights/instruction_generator.py | 14 +++-- expo/research_assistant.py | 28 ++++++--- expo/run_experiment.py | 6 ++ expo/utils.py | 2 + 11 files changed, 202 insertions(+), 38 deletions(-) create mode 100644 expo/data/custom_task.py create mode 100644 expo/experimenter/mle_bench/instructions.py diff --git a/expo/MCTS.py b/expo/MCTS.py index 7e1d7c88a..a8410748e 100644 --- a/expo/MCTS.py +++ b/expo/MCTS.py @@ -3,10 +3,12 @@ import math import os import pickle import random +import shutil import numpy as np import pandas as pd +from expo.data.custom_task import get_mle_bench_requirements from expo.data.dataset import generate_task_requirement, get_split_dataset_path from expo.evaluation.evaluation import evaluate_score from expo.insights.instruction_generator import InstructionGenerator @@ -17,9 +19,6 @@ from metagpt.utils.common import read_json_file def initialize_di_root_node(state, reflection: bool = True): - # state = create_initial_state( - # task, start_task_id=start_task_id, data_config=data_config, low_is_better=low_is_better, name=name - # ) role = ResearchAssistant( node_id="0", start_task_id=state["start_task_id"], use_reflection=reflection, role_dir=state["node_dir"] ) @@ -29,20 +28,33 @@ def create_initial_state( task, start_task_id, data_config, low_is_better: bool, name: str, special_instruction: str, args ): + external_eval = args.external_eval + + if args.custom_dataset_dir: + dataset_config = None + datasets_dir = args.custom_dataset_dir + requirement = get_mle_bench_requirements(args.custom_dataset_dir, data_config) + exp_pool_path = None + else: + dataset_config = data_config["datasets"][task] + datasets_dir = get_split_dataset_path(task, data_config) + requirement = generate_task_requirement(task, data_config, is_di=True, special_instruction=special_instruction) + exp_pool_path = get_exp_pool_path(task, data_config, pool_name="ds_analysis_pool") + initial_state = { "task": task, "work_dir": data_config["work_dir"], "node_dir": os.path.join(data_config["work_dir"], data_config["role_dir"], f"{task}{name}"), - "dataset_config": data_config["datasets"][task], - "datasets_dir": get_split_dataset_path(task, data_config), - "exp_pool_path": get_exp_pool_path(task, data_config, pool_name="ds_analysis_pool"), - "requirement": generate_task_requirement( - task, data_config, is_di=True, special_instruction=special_instruction - ), + "dataset_config": dataset_config, + "datasets_dir": datasets_dir, # won't be used if external eval is used + "exp_pool_path": exp_pool_path, + "requirement": requirement, "has_run": False, "start_task_id": start_task_id, "low_is_better": low_is_better, "role_timeout": args.role_timeout, + "external_eval": external_eval, + "custom_dataset_dir": args.custom_dataset_dir, } os.makedirs(initial_state["node_dir"], exist_ok=True) return initial_state @@ -173,22 
+185,34 @@ class Node: node.save_new_role(new_role) self.add_child(node) - def evaluate_prediction(self, split): - pred_path = os.path.join(self.state["work_dir"], self.state["task"], f"{split}_predictions.csv") - pred_node_path = os.path.join(self.state["node_dir"], f"Node-{self.id}-{split}_predictions.csv") + def get_predictions_path(self, split): + return os.path.join(self.state["node_dir"], f"Node-{self.id}-{split}_predictions.csv") + + def get_and_move_predictions(self, split): + if not os.path.exists(self.get_predictions_path(split)): + pred_path = os.path.join(self.state["work_dir"], self.state["task"], f"{split}_predictions.csv") + shutil.copy(pred_path, self.get_predictions_path(split)) + os.remove(pred_path) + return pd.read_csv(self.get_predictions_path(split)) + + def get_gt(self, split): gt_path = os.path.join(self.state["datasets_dir"][f"{split}_target"]) - preds = pd.read_csv(pred_path)["target"] - preds.to_csv(pred_node_path, index=False) - gt = pd.read_csv(gt_path)["target"] + return pd.read_csv(gt_path) + + def evaluate_prediction(self, split): + preds = self.get_and_move_predictions(split)["target"] + gt = self.get_gt(split)["target"] metric = self.state["dataset_config"]["metric"] - # remove original predictions.csv - os.remove(pred_path) return evaluate_score(preds, gt, metric) def evaluate_simulation(self, score_dict): - scores = {"dev_score": self.evaluate_prediction("dev"), "test_score": self.evaluate_prediction("test")} - scores["score"] = scores["dev_score"] - score_dict.update(scores) + if self.state["external_eval"]: # use external evaluation + scores = {"dev_score": self.evaluate_prediction("dev"), "test_score": self.evaluate_prediction("test")} + scores["score"] = scores["dev_score"] + score_dict.update(scores) + else: + self.get_and_move_predictions("dev") + self.get_and_move_predictions("test") return score_dict async def run_node(self, role=None): @@ -215,7 +239,6 @@ class Node: mcts_logger.log("MCTS", f"Role-level timeout: {e}") break except Exception as e: - print(f"Error: {e}") mcts_logger.log("MCTS", f"Error in running the role: {e}") num_runs += 1 diff --git a/expo/data/custom_task.py b/expo/data/custom_task.py new file mode 100644 index 000000000..2bd88abde --- /dev/null +++ b/expo/data/custom_task.py @@ -0,0 +1,38 @@ +import os + +from expo.experimenter.mle_bench.instructions import ( + ADDITIONAL_NOTES, + INSTRUCTIONS, + INSTRUCTIONS_OBFUSCATED, +) + +MLE_BENCH_FILES = ["description.md", "description_obfuscated.md"] + + +MLE_REQUIREMENTS = """ +{instructions} + +{additional_notes} + +COMPETITION INSTRUCTIONS +------ + +{task_description} + +""" + + +def get_mle_bench_requirements(dataset_dir, data_config, obfuscated=False): + if obfuscated: + instructions = INSTRUCTIONS_OBFUSCATED + task_file = "description_obfuscated.md" + else: + instructions = INSTRUCTIONS + task_file = "description.md" + + with open(os.path.join(dataset_dir, task_file)) as f: + task_description = f.read() + mle_requirement = MLE_REQUIREMENTS.format( + instructions=instructions, additional_notes=ADDITIONAL_NOTES, task_description=task_description + ) + return mle_requirement diff --git a/expo/data/dataset.py b/expo/data/dataset.py index e076284d6..8b0c5b980 100644 --- a/expo/data/dataset.py +++ b/expo/data/dataset.py @@ -268,7 +268,7 @@ class ExpDataset: dataset_info = self.get_dataset_info() num_classes = dataset_info["metadata"]["NumberOfClasses"] if num_classes == 2: - metric = "f1" + metric = "f1 binary" elif 2 < num_classes <= 200: metric = "f1 weighted" elif num_classes > 200
or num_classes == 0: diff --git a/expo/evaluation/evaluation.py b/expo/evaluation/evaluation.py index 16b3acb71..1ba7fa60f 100644 --- a/expo/evaluation/evaluation.py +++ b/expo/evaluation/evaluation.py @@ -22,3 +22,15 @@ def evaluate_score(pred, gt, metric): return mean_squared_error(np.log1p(gt), np.log1p(pred), squared=False) else: raise ValueError(f"Metric {metric} not supported") + + +def node_evaluate_score_sela(node): + preds = node.get_and_move_predictions("test")["target"] + gt = node.get_gt("test")["target"] + metric = node.state["dataset_config"]["metric"] + return evaluate_score(preds, gt, metric) + + +def node_evaluate_score_mlebench(node): + # TODO + return 0 diff --git a/expo/experimenter/experimenter.py b/expo/experimenter/experimenter.py index 9aa879e24..417adabad 100644 --- a/expo/experimenter/experimenter.py +++ b/expo/experimenter/experimenter.py @@ -43,7 +43,10 @@ class Experimenter: except Exception as e: print(f"Error: {e}") num_runs += 1 - save_notebook(role=di, save_dir=self.result_path, name=f"{self.args.task}_{self.start_time}_{run_idx}") + # save_notebook(role=di, save_dir=self.result_path, name=f"{self.args.task}_{self.start_time}_{run_idx}") + save_name = self.get_save_name() + save_notebook(role=di, save_dir=self.result_path, name=f"{save_name}_{run_idx}") + if not run_finished: score_dict = {"train_score": -1, "dev_score": -1, "test_score": -1, "score": -1} return score_dict diff --git a/expo/experimenter/mcts.py b/expo/experimenter/mcts.py index d212eb204..37fc7a071 100644 --- a/expo/experimenter/mcts.py +++ b/expo/experimenter/mcts.py @@ -1,5 +1,9 @@ import shutil +from expo.evaluation.evaluation import ( + node_evaluate_score_mlebench, + node_evaluate_score_sela, +) from expo.evaluation.visualize_mcts import get_tree_text from expo.experimenter.experimenter import Experimenter from expo.Greedy import Greedy, Random @@ -14,25 +18,35 @@ class MCTSExperimenter(Experimenter): self.start_task_id = 1 # start from datapreprocessing if it is image task else: self.start_task_id = args.start_task_id + + if args.eval_func == "sela": + self.eval_func = node_evaluate_score_sela + elif args.eval_func == "mlebench": + self.eval_func = node_evaluate_score_mlebench + super().__init__(args, **kwargs) self.tree_mode = tree_mode async def run_experiment(self): + use_fixed_insights = self.args.use_fixed_insights + depth = 5 if self.tree_mode == "greedy": - mcts = Greedy(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights) + mcts = Greedy(root_node=None, max_depth=depth, use_fixed_insights=use_fixed_insights) elif self.tree_mode == "random": - mcts = Random(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights) + mcts = Random(root_node=None, max_depth=depth, use_fixed_insights=use_fixed_insights) else: - mcts = MCTS(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights) + mcts = MCTS(root_node=None, max_depth=depth, use_fixed_insights=use_fixed_insights) best_nodes = await mcts.search(state=self.state, args=self.args) best_node = best_nodes["global_best"] dev_best_node = best_nodes["dev_best"] score_dict = best_nodes["scores"] + additional_scores = {"grader": self.eval_func(dev_best_node)} text, num_generated_codes = get_tree_text(mcts.root_node) text += f"Generated {num_generated_codes} unique codes.\n" text += f"Best node: {best_node.id}, score: {best_node.raw_reward}\n" text += f"Dev best node: {dev_best_node.id}, score: {dev_best_node.raw_reward}\n" + text += f"Grader score: 
{additional_scores['grader']}\n" print(text) results = [ { @@ -45,6 +59,7 @@ class MCTSExperimenter(Experimenter): "tree_text": text, "args": vars(self.args), "scores": score_dict, + "additional_scores": additional_scores, } ] self.save_result(results) diff --git a/expo/experimenter/mle_bench/instructions.py b/expo/experimenter/mle_bench/instructions.py new file mode 100644 index 000000000..301b465c1 --- /dev/null +++ b/expo/experimenter/mle_bench/instructions.py @@ -0,0 +1,47 @@ +INSTRUCTIONS = """ +You are participating in *MLE-bench*, an offline version of Kaggle competitions adapted to test the coding ability of agents like yourself. + +You will be given a machine learning task. You must solve the task by training a model and running the model on the test set to produce a submission file. You may not hand-label the data to produce a submission file. + +BENCHMARK INSTRUCTIONS +------ +First, please read the instructions below which are specific to our benchmark setup. Note, these instructions can be found again in `/home/instructions.txt`. + +- **Competition-specific instructions**: The instructions for the competition are available in `/home/data/description.md`. Read this file to understand the problem setup for the competition you are currently solving. +- **Dataset**: The dataset files for the competition are available in `/home/data/`. +- **Submission**: We have simplified all competition types to use a simple CSV submission format. Regardless of what the competition instructions say, you MUST produce a submission file at `/home/submission/submission.csv`. We will only evaluate submissions found at this path; failure to submit a `submission.csv` at this path will result in a failed attempt. +- **Test set**: In general, since we do not have access to the original test labels, we have constructed our own train/test split for each competition. As such, the dataset you see in `/home/data/` may differ slightly from what is described in the competition instructions. Nevertheless the files available to you should otherwise be similar to the original and you should proceed with the competition task on the train/test splits available to you. +- **Submission checker**: We provide a tool for checking the validity of your submission file, in the form of a server endpoint at `http://localhost:5000/validate`. If you post a submission file to the server (e.g. `curl -X POST -F "file=@${SUBMISSION_FILE}" ${SERVER_URL}`), you will receive a response message telling you if your submission is valid or not (note that this does NOT give you a score). For your convenience, we provide a script `/home/validate_submission.sh` which can help you make this submission to the validation server. +- **Cheating**: You are not allowed to view others' solutions to this competition. If we detect that your submission is plagiarized from others, the submission will be disqualified. + +Your submission will be graded based on the metric in the competition-specific instructions. Your goal is to achieve the best score possible on the competition task. + +Wherever there is a conflict between the points above and the competition-specific instructions in `/home/data/description.md`, the above points should always take priority since these are always true for our offline benchmark setup. +""" + +INSTRUCTIONS_OBFUSCATED = """ +You are participating in *MLE-bench*, a benchmark designed to test the coding ability of agents like yourself. + +You will be given a machine learning task. 
You must solve the task by training a model and running the model on the test set to produce a submission file. You may not hand-label the data to produce a submission file. + +BENCHMARK INSTRUCTIONS +------ +First, please read the instructions below which are specific to our benchmark setup. Note, these instructions can be found again in `/home/instructions.txt`. + +- **Task-specific instructions**: The instructions for the task are available in `/home/data/description.md`. Read this file to understand the problem setup for the task you are currently solving. +- **Dataset**: The dataset files for the task are available in `/home/data/`. +- **Submission**: Regardless of what the task instructions say, you MUST produce a submission file at `/home/submission/submission.csv`. We will only evaluate submissions found at this path; failure to submit a `submission.csv` at this path will result in a failed attempt. +- **Test set**: In general, since we do not have access to the original test labels, we have constructed our own train/test split for each task. As such, the dataset you see in `/home/data/` may differ slightly from what is described in the task instructions. Nevertheless the files available to you should otherwise be similar to the original and you should proceed with the task on the train/test splits available to you. +- **Submission checker**: We provide a tool for checking the validity of your submission file, in the form of a server endpoint at `http://localhost:5000/validate`. If you post a submission file to the server (e.g. `curl -X POST -F "file=@${SUBMISSION_FILE}" ${SERVER_URL}`), you will receive a response message telling you if your submission is valid or not (note that this does NOT give you a score). For your convenience, we provide a script `/home/validate_submission.sh` which can help you make this submission to the validation server. +- **Cheating**: You are not allowed to view others' solutions to this task. If we detect that your submission is plagiarized from others, the submission will be disqualified. + +Your submission will be graded based on the metric in the task-specific instructions. Your goal is to achieve the best score possible on the task. + +Wherever there is a conflict between the points above and the task-specific instructions in `/home/data/description.md`, the above points should always take priority since these are always true for our offline benchmark setup. +""" + +ADDITIONAL_NOTES = """ +ADDITIONAL NOTES +------ +- **Compute**: You have access to a GPU with the appropriate drivers installed. 
+""" diff --git a/expo/insights/instruction_generator.py b/expo/insights/instruction_generator.py index 7fa4d72ea..7fe5ceece 100644 --- a/expo/insights/instruction_generator.py +++ b/expo/insights/instruction_generator.py @@ -37,12 +37,18 @@ class InstructionGenerator: def __init__(self, state, use_fixed_insights, from_scratch): self.state = state self.file_path = state["exp_pool_path"] - self.dataset_info_path = f"{self.data_config['datasets_dir']}/{state['task']}/dataset_info.json" - with open(self.dataset_info_path, "r") as file: - self.dataset_info = json.load(file) + if state["custom_dataset_dir"]: + self.dataset_info = "xxx" + else: + dataset_info_path = f"{self.data_config['datasets_dir']}/{state['task']}/dataset_info.json" + with open(dataset_info_path, "r") as file: + self.dataset_info = json.load(file) self.use_fixed_insights = use_fixed_insights self.proposer = SolutionDesigner() - self.from_scratch = from_scratch + if self.file_path is None: + self.from_scratch = True + else: + self.from_scratch = from_scratch async def initialize(self): if self.from_scratch: diff --git a/expo/research_assistant.py b/expo/research_assistant.py index 0b53521a3..d068dd4e5 100644 --- a/expo/research_assistant.py +++ b/expo/research_assistant.py @@ -13,15 +13,19 @@ from metagpt.roles.di.data_interpreter import DataInterpreter from metagpt.schema import Message, Task, TaskResult from metagpt.utils.common import CodeParser, write_json_file -EXTRACT_SCORE_PROMPT = """ -# Code: +CODE_BLOCK_RESULT = """ +## Code: {code} -# Execution Result: +## Execution Result: {result} +""" +EXTRACT_SCORE_PROMPT = """ +# Code Blocks +{code_block} # Instruction: -Based on the code and execution result, please extract the scores and return it as a dictionary. +Based on the code and execution result, please extract the **final scores** and return it as a dictionary. If you cannot find the scores, please still return a dictionary with the keys 'train_score', 'dev_score', and 'test_score', and set the values to -1. 
# Format: @@ -109,9 +113,17 @@ class ResearchAssistant(DataInterpreter): return score_dict async def llm_extract_score(self): - result_text = self.planner.plan.task_map[str(len(self.planner.plan.task_map))].result - code_text = self.planner.plan.task_map[str(len(self.planner.plan.task_map))].code - rsp = await self.llm.aask(EXTRACT_SCORE_PROMPT.format(code=code_text, result=result_text, role="user")) + # result_text = self.planner.plan.task_map[str(len(self.planner.plan.task_map))].result + # code_text = self.planner.plan.task_map[str(len(self.planner.plan.task_map))].code + num_tasks = len(self.planner.plan.task_map) + task_map = self.planner.plan.task_map + code_block = "\n".join( + [ + CODE_BLOCK_RESULT.format(code=task_map[str(i + 1)].code, result=task_map[str(i + 1)].result) + for i in range(num_tasks) + ] + ) + rsp = await self.llm.aask(EXTRACT_SCORE_PROMPT.format(code_block=code_block, role="user")) json_block = CodeParser.parse_code(block=None, text=rsp) score_dict = json.loads(json_block) return score_dict @@ -161,7 +173,7 @@ class ResearchAssistant(DataInterpreter): stg_path = self.role_dir name = self.get_node_name() role_path = os.path.join(stg_path, f"{name}.json") - # 将状态保存为 JSON 文件 + # save state as json file write_json_file(role_path, self.model_dump()) def remap_tasks(self): diff --git a/expo/run_experiment.py b/expo/run_experiment.py index c43da12fd..53fcdd18c 100644 --- a/expo/run_experiment.py +++ b/expo/run_experiment.py @@ -31,10 +31,16 @@ def get_mcts_args(parser): parser.set_defaults(load_tree=False) parser.add_argument("--rollouts", type=int, default=5) parser.add_argument("--use_fixed_insights", dest="use_fixed_insights", action="store_true") + parser.set_defaults(use_fixed_insights=False) parser.add_argument("--start_task_id", type=int, default=2) parser.add_argument( "--from_scratch", dest="from_scratch", action="store_true", help="Generate solutions from scratch" ) + parser.set_defaults(from_scratch=False) + parser.add_argument("--no_external_eval", dest="external_eval", action="store_false") + parser.set_defaults(external_eval=True) + parser.add_argument("--eval_func", type=str, default="sela", choices=["sela", "mlebench"]) + parser.add_argument("--custom_dataset_dir", type=str, default=None) def get_aug_exp_args(parser): diff --git a/expo/utils.py b/expo/utils.py index b022879b0..f3381c91c 100644 --- a/expo/utils.py +++ b/expo/utils.py @@ -51,6 +51,8 @@ def get_exp_pool_path(task_name, data_config, pool_name="analysis_pool"): f"Dataset {task_name} not found in config file. Available datasets: {data_config['datasets'].keys()}" ) exp_pool_path = os.path.join(data_path, f"{pool_name}.json") + if not os.path.exists(exp_pool_path): + return None return exp_pool_path
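
Usage sketch (a minimal illustration, not part of the patch): the snippet below shows how the new helper added in expo/data/custom_task.py is expected to be called to build a task requirement from a local MLE-bench-style dataset folder. The directory path is hypothetical, and an empty dict is passed for data_config since the function body above does not use it.

from expo.data.custom_task import get_mle_bench_requirements

# Hypothetical dataset folder containing description.md (and description_obfuscated.md);
# in the CLI flow this value would come from --custom_dataset_dir.
dataset_dir = "datasets/my-competition"

# obfuscated=False reads description.md and prepends INSTRUCTIONS;
# obfuscated=True switches to description_obfuscated.md and INSTRUCTIONS_OBFUSCATED.
requirement = get_mle_bench_requirements(dataset_dir, data_config={}, obfuscated=False)
print(requirement[:300])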