1. add eval_func for sela and compatibility with others

2. llm extract score (use all code blocks and execution results) 3. add argument for custom dataset dir 4. support custom dataset requirements
2026-07-17 16:41:05 +02:00 · 2024-10-12 17:16:51 +08:00 · 2024-10-12 17:16:51 +08:00 · 3a57060e25
commit 3a57060e25
parent eda9322361
11 changed files with 202 additions and 38 deletions
--- a/expo/MCTS.py
+++ b/expo/MCTS.py
@ -3,10 +3,12 @@ import math
 import os
 import pickle
 import random
+import shutil

 import numpy as np
 import pandas as pd

+from expo.data.custom_task import get_mle_bench_requirements
 from expo.data.dataset import generate_task_requirement, get_split_dataset_path
 from expo.evaluation.evaluation import evaluate_score
 from expo.insights.instruction_generator import InstructionGenerator
@ -17,9 +19,6 @@ from metagpt.utils.common import read_json_file


 def initialize_di_root_node(state, reflection: bool = True):
-    # state = create_initial_state(
-    #     task, start_task_id=start_task_id, data_config=data_config, low_is_better=low_is_better, name=name
-    # )
    role = ResearchAssistant(
        node_id="0", start_task_id=state["start_task_id"], use_reflection=reflection, role_dir=state["node_dir"]
    )
@ -29,20 +28,33 @@ def initialize_di_root_node(state, reflection: bool = True):
 def create_initial_state(
    task, start_task_id, data_config, low_is_better: bool, name: str, special_instruction: str, args
 ):
+    external_eval = args.external_eval
+
+    if args.custom_dataset_dir:
+        dataset_config = None
+        datasets_dir = args.custom_dataset_dir
+        requirement = get_mle_bench_requirements(args.custom_dataset_dir, data_config)
+        exp_pool_path = None
+    else:
+        dataset_config = data_config["datasets"][task]
+        datasets_dir = get_split_dataset_path(task, data_config)
+        requirement = generate_task_requirement(task, data_config, is_di=True, special_instruction=special_instruction)
+        exp_pool_path = get_exp_pool_path(task, data_config, pool_name="ds_analysis_pool")
+
    initial_state = {
        "task": task,
        "work_dir": data_config["work_dir"],
        "node_dir": os.path.join(data_config["work_dir"], data_config["role_dir"], f"{task}{name}"),
-        "dataset_config": data_config["datasets"][task],
-        "datasets_dir": get_split_dataset_path(task, data_config),
-        "exp_pool_path": get_exp_pool_path(task, data_config, pool_name="ds_analysis_pool"),
-        "requirement": generate_task_requirement(
-            task, data_config, is_di=True, special_instruction=special_instruction
-        ),
+        "dataset_config": dataset_config,
+        "datasets_dir": datasets_dir,  # won't be used if external eval is used
+        "exp_pool_path": exp_pool_path,
+        "requirement": requirement,
        "has_run": False,
        "start_task_id": start_task_id,
        "low_is_better": low_is_better,
        "role_timeout": args.role_timeout,
+        "external_eval": external_eval,
+        "custom_dataset_dir": args.custom_dataset_dir,
    }
    os.makedirs(initial_state["node_dir"], exist_ok=True)
    return initial_state
@ -173,22 +185,34 @@ class Node:
            node.save_new_role(new_role)
            self.add_child(node)

-    def evaluate_prediction(self, split):
-        pred_path = os.path.join(self.state["work_dir"], self.state["task"], f"{split}_predictions.csv")
-        pred_node_path = os.path.join(self.state["node_dir"], f"Node-{self.id}-{split}_predictions.csv")
+    def get_predictions_path(self, split):
+        return os.path.join(self.state["node_dir"], f"Node-{self.id}-{split}_predictions.csv")
+
+    def get_and_move_predictions(self, split):
+        if not os.path.exists(self.get_predictions_path(split)):
+            pred_path = os.path.join(self.state["work_dir"], self.state["task"], f"{split}_predictions.csv")
+            shutil.copy(pred_path, self.get_predictions_path(split))
+            os.remove(pred_path)
+        return pd.read_csv(self.get_predictions_path(split))
+
+    def get_gt(self, split):
        gt_path = os.path.join(self.state["datasets_dir"][f"{split}_target"])
-        preds = pd.read_csv(pred_path)["target"]
-        preds.to_csv(pred_node_path, index=False)
-        gt = pd.read_csv(gt_path)["target"]
+        return pd.read_csv(gt_path)
+
+    def evaluate_prediction(self, split):
+        preds = self.get_and_move_predictions(split)["target"]
+        gt = self.get_gt(split)["target"]
        metric = self.state["dataset_config"]["metric"]
-        # remove original predictions.csv
-        os.remove(pred_path)
        return evaluate_score(preds, gt, metric)

    def evaluate_simulation(self, score_dict):
-        scores = {"dev_score": self.evaluate_prediction("dev"), "test_score": self.evaluate_prediction("test")}
-        scores["score"] = scores["dev_score"]
-        score_dict.update(scores)
+        if self.state["external_eval"]:  # use external evaluation
+            scores = {"dev_score": self.evaluate_prediction("dev"), "test_score": self.evaluate_prediction("test")}
+            scores["score"] = scores["dev_score"]
+            score_dict.update(scores)
+        else:
+            self.get_and_move_predictions("dev")
+            self.get_and_move_predictions("test")
        return score_dict

    async def run_node(self, role=None):
@ -215,7 +239,6 @@ class Node:
                mcts_logger.log("MCTS", f"Role-level timeout: {e}")
                break
            except Exception as e:
-                print(f"Error: {e}")
                mcts_logger.log("MCTS", f"Error in running the role: {e}")
                num_runs += 1

--- a/expo/data/custom_task.py
+++ b/expo/data/custom_task.py
@ -0,0 +1,38 @@
+import os
+
+from expo.experimenter.mle_bench.instructions import (
+    ADDITIONAL_NOTES,
+    INSTRUCTIONS,
+    INSTRUCTIONS_OBFUSCATED,
+)
+
+MLE_BENCH_FILES = ["description.md", "description_obfuscated.md"]
+
+
+MLE_REQUIREMENTS = """
+{instructions}
+
+{additonal_notes}
+
+COMPETITION INSTRUCTIONS
+------
+
+{task_description}
+
+"""
+
+
+def get_mle_bench_requirements(dataset_dir, data_config, obfuscated=False):
+    if obfuscated:
+        instructions = INSTRUCTIONS_OBFUSCATED
+        task_file = "description_obfuscated.md"
+    else:
+        instructions = INSTRUCTIONS
+        task_file = "description.md"
+
+    with open(os.path.join(dataset_dir, task_file)) as f:
+        task_description = f.read()
+    mle_requirement = MLE_REQUIREMENTS.format(
+        instructions=instructions, additonal_notes=ADDITIONAL_NOTES, task_description=task_description
+    )
+    return mle_requirement
--- a/expo/data/dataset.py
+++ b/expo/data/dataset.py
@ -268,7 +268,7 @@ class ExpDataset:
        dataset_info = self.get_dataset_info()
        num_classes = dataset_info["metadata"]["NumberOfClasses"]
        if num_classes == 2:
-            metric = "f1"
+            metric = "f1 binary"
        elif 2 < num_classes <= 200:
            metric = "f1 weighted"
        elif num_classes > 200 or num_classes == 0:
--- a/expo/evaluation/evaluation.py
+++ b/expo/evaluation/evaluation.py
@ -22,3 +22,15 @@ def evaluate_score(pred, gt, metric):
        return mean_squared_error(np.log1p(gt), np.log1p(pred), squared=False)
    else:
        raise ValueError(f"Metric {metric} not supported")
+
+
+def node_evaluate_score_sela(node):
+    preds = node.get_and_move_predictions("test")["target"]
+    gt = node.get_gt("test")["target"]
+    metric = node.state["dataset_config"]["metric"]
+    return evaluate_score(preds, gt, metric)
+
+
+def node_evaluate_score_mlebench(node):
+    # TODO
+    return 0
--- a/expo/experimenter/experimenter.py
+++ b/expo/experimenter/experimenter.py
@ -43,7 +43,10 @@ class Experimenter:
            except Exception as e:
                print(f"Error: {e}")
                num_runs += 1
-        save_notebook(role=di, save_dir=self.result_path, name=f"{self.args.task}_{self.start_time}_{run_idx}")
+        # save_notebook(role=di, save_dir=self.result_path, name=f"{self.args.task}_{self.start_time}_{run_idx}")
+        save_name = self.get_save_name()
+        save_notebook(role=di, save_dir=self.result_path, name=f"{save_name}_{run_idx}")
+
        if not run_finished:
            score_dict = {"train_score": -1, "dev_score": -1, "test_score": -1, "score": -1}
        return score_dict
--- a/expo/experimenter/mcts.py
+++ b/expo/experimenter/mcts.py
@ -1,5 +1,9 @@
 import shutil

+from expo.evaluation.evaluation import (
+    node_evaluate_score_mlebench,
+    node_evaluate_score_sela,
+)
 from expo.evaluation.visualize_mcts import get_tree_text
 from expo.experimenter.experimenter import Experimenter
 from expo.Greedy import Greedy, Random
@ -14,25 +18,35 @@ class MCTSExperimenter(Experimenter):
            self.start_task_id = 1  # start from datapreprocessing if it is image task
        else:
            self.start_task_id = args.start_task_id
+
+        if args.eval_func == "sela":
+            self.eval_func = node_evaluate_score_sela
+        elif args.eval_func == "mlebench":
+            self.eval_func = node_evaluate_score_mlebench
+
        super().__init__(args, **kwargs)
        self.tree_mode = tree_mode

    async def run_experiment(self):
+        use_fixed_insights = self.args.use_fixed_insights
+        depth = 5
        if self.tree_mode == "greedy":
-            mcts = Greedy(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights)
+            mcts = Greedy(root_node=None, max_depth=depth, use_fixed_insights=use_fixed_insights)
        elif self.tree_mode == "random":
-            mcts = Random(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights)
+            mcts = Random(root_node=None, max_depth=depth, use_fixed_insights=use_fixed_insights)
        else:
-            mcts = MCTS(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights)
+            mcts = MCTS(root_node=None, max_depth=depth, use_fixed_insights=use_fixed_insights)
        best_nodes = await mcts.search(state=self.state, args=self.args)
        best_node = best_nodes["global_best"]
        dev_best_node = best_nodes["dev_best"]
        score_dict = best_nodes["scores"]
+        additional_scores = {"grader": self.eval_func(dev_best_node)}

        text, num_generated_codes = get_tree_text(mcts.root_node)
        text += f"Generated {num_generated_codes} unique codes.\n"
        text += f"Best node: {best_node.id}, score: {best_node.raw_reward}\n"
        text += f"Dev best node: {dev_best_node.id}, score: {dev_best_node.raw_reward}\n"
+        text += f"Grader score: {additional_scores['grader']}\n"
        print(text)
        results = [
            {
@ -45,6 +59,7 @@ class MCTSExperimenter(Experimenter):
                "tree_text": text,
                "args": vars(self.args),
                "scores": score_dict,
+                "additional_scores": additional_scores,
            }
        ]
        self.save_result(results)
--- a/expo/experimenter/mle_bench/instructions.py
+++ b/expo/experimenter/mle_bench/instructions.py
@ -0,0 +1,47 @@
+INSTRUCTIONS = """
+You are participating in *MLE-bench*, an offline version of Kaggle competitions adapted to test the coding ability of agents like yourself.
+
+You will be given a machine learning task. You must solve the task by training a model and running the model on the test set to produce a submission file. You may not hand-label the data to produce a submission file.
+
+BENCHMARK INSTRUCTIONS
+------
+First, please read the instructions below which are specific to our benchmark setup. Note, these instructions can be found again in `/home/instructions.txt`.
+
+- **Competition-specific instructions**: The instructions for the competition are available in `/home/data/description.md`. Read this file to understand the problem setup for the competition you are currently solving.
+- **Dataset**: The dataset files for the competition are available in `/home/data/`.
+- **Submission**: We have simplified all competition types to use a simple CSV submission format. Regardless of what the competition instructions say, you MUST produce a submission file at `/home/submission/submission.csv`. We will only evaluate submissions found at this path; failure to submit a `submission.csv` at this path will result in a failed attempt.
+- **Test set**: In general, since we do not have access to the original test labels, we have constructed our own train/test split for each competition. As such, the dataset you see in `/home/data/` may differ slightly from what is described in the competition instructions. Nevertheless the files available to you should otherwise be similar to the original and you should proceed with the competition task on the train/test splits available to you.
+- **Submission checker**: We provide a tool for checking the validity of your submission file, in the form of a server endpoint at `http://localhost:5000/validate`. If you post a submission file to the server (e.g. `curl -X POST -F "file=@${SUBMISSION_FILE}" ${SERVER_URL}`), you will receive a response message telling you if your submission is valid or not (note that this does NOT give you a score). For your convenience, we provide a script `/home/validate_submission.sh` which can help you make this submission to the validation server.
+- **Cheating**: You are not allowed to view others' solutions to this competition. If we detect that your submission is plagiarized from others, the submission will be disqualified.
+
+Your submission will be graded based on the metric in the competition-specific instructions. Your goal is to achieve the best score possible on the competition task. 
+
+Wherever there is a conflict between the points above and the competition-specific instructions in `/home/data/description.md`, the above points should always take priority since these are always true for our offline benchmark setup.
+"""
+
+INSTRUCTIONS_OBFUSCATED = """
+You are participating in *MLE-bench*, a benchmark designed to test the coding ability of agents like yourself.
+
+You will be given a machine learning task. You must solve the task by training a model and running the model on the test set to produce a submission file. You may not hand-label the data to produce a submission file.
+
+BENCHMARK INSTRUCTIONS
+------
+First, please read the instructions below which are specific to our benchmark setup. Note, these instructions can be found again in `/home/instructions.txt`.
+
+- **Task-specific instructions**: The instructions for the task are available in `/home/data/description.md`. Read this file to understand the problem setup for the task you are currently solving.
+- **Dataset**: The dataset files for the task are available in `/home/data/`.
+- **Submission**: Regardless of what the task instructions say, you MUST produce a submission file at `/home/submission/submission.csv`. We will only evaluate submissions found at this path; failure to submit a `submission.csv` at this path will result in a failed attempt.
+- **Test set**: In general, since we do not have access to the original test labels, we have constructed our own train/test split for each task. As such, the dataset you see in `/home/data/` may differ slightly from what is described in the task instructions. Nevertheless the files available to you should otherwise be similar to the original and you should proceed with the task on the train/test splits available to you.
+- **Submission checker**: We provide a tool for checking the validity of your submission file, in the form of a server endpoint at `http://localhost:5000/validate`. If you post a submission file to the server (e.g. `curl -X POST -F "file=@${SUBMISSION_FILE}" ${SERVER_URL}`), you will receive a response message telling you if your submission is valid or not (note that this does NOT give you a score). For your convenience, we provide a script `/home/validate_submission.sh` which can help you make this submission to the validation server.
+- **Cheating**: You are not allowed to view others' solutions to this task. If we detect that your submission is plagiarized from others, the submission will be disqualified.
+
+Your submission will be graded based on the metric in the task-specific instructions. Your goal is to achieve the best score possible on the task. 
+
+Wherever there is a conflict between the points above and the task-specific instructions in `/home/data/description.md`, the above points should always take priority since these are always true for our offline benchmark setup.
+"""
+
+ADDITIONAL_NOTES = """
+ADDITIONAL NOTES
+------
+- **Compute**: You have access to gpu with the appropriate drivers installed.
+"""
--- a/expo/insights/instruction_generator.py
+++ b/expo/insights/instruction_generator.py
@ -37,12 +37,18 @@ class InstructionGenerator:
    def __init__(self, state, use_fixed_insights, from_scratch):
        self.state = state
        self.file_path = state["exp_pool_path"]
-        self.dataset_info_path = f"{self.data_config['datasets_dir']}/{state['task']}/dataset_info.json"
-        with open(self.dataset_info_path, "r") as file:
-            self.dataset_info = json.load(file)
+        if state["custom_dataset_dir"]:
+            self.dataset_info = "xxx"
+        else:
+            dataset_info_path = f"{self.data_config['datasets_dir']}/{state['task']}/dataset_info.json"
+            with open(dataset_info_path, "r") as file:
+                self.dataset_info = json.load(file)
        self.use_fixed_insights = use_fixed_insights
        self.proposer = SolutionDesigner()
-        self.from_scratch = from_scratch
+        if self.file_path is None:
+            self.from_scratch = True
+        else:
+            self.from_scratch = from_scratch

    async def initialize(self):
        if self.from_scratch:
--- a/expo/research_assistant.py
+++ b/expo/research_assistant.py
@ -13,15 +13,19 @@ from metagpt.roles.di.data_interpreter import DataInterpreter
 from metagpt.schema import Message, Task, TaskResult
 from metagpt.utils.common import CodeParser, write_json_file

-EXTRACT_SCORE_PROMPT = """
-# Code:
+CODE_BLOCK_RESULT = """
+## Code:
 {code}

-# Execution Result:
+## Execution Result:
 {result}
+"""

+EXTRACT_SCORE_PROMPT = """
+# Code Blocks
+{code_block}
 # Instruction:
-Based on the code and execution result, please extract the scores and return it as a dictionary.
+Based on the code and execution result, please extract the **final scores** and return it as a dictionary.
 If you cannot find the scores, please still return a dictionary with the keys 'train_score', 'dev_score', and 'test_score', and set the values to -1.

 # Format:
@ -109,9 +113,17 @@ class ResearchAssistant(DataInterpreter):
        return score_dict

    async def llm_extract_score(self):
-        result_text = self.planner.plan.task_map[str(len(self.planner.plan.task_map))].result
-        code_text = self.planner.plan.task_map[str(len(self.planner.plan.task_map))].code
-        rsp = await self.llm.aask(EXTRACT_SCORE_PROMPT.format(code=code_text, result=result_text, role="user"))
+        # result_text = self.planner.plan.task_map[str(len(self.planner.plan.task_map))].result
+        # code_text = self.planner.plan.task_map[str(len(self.planner.plan.task_map))].code
+        num_tasks = len(self.planner.plan.task_map)
+        task_map = self.planner.plan.task_map
+        code_block = "\n".join(
+            [
+                CODE_BLOCK_RESULT.format(code=task_map[str(i + 1)].code, result=task_map[str(i + 1)].result)
+                for i in range(num_tasks)
+            ]
+        )
+        rsp = await self.llm.aask(EXTRACT_SCORE_PROMPT.format(code_block=code_block, role="user"))
        json_block = CodeParser.parse_code(block=None, text=rsp)
        score_dict = json.loads(json_block)
        return score_dict
@ -161,7 +173,7 @@ class ResearchAssistant(DataInterpreter):
        stg_path = self.role_dir
        name = self.get_node_name()
        role_path = os.path.join(stg_path, f"{name}.json")
-        # 将状态保存为 JSON 文件
+        # save state as json file
        write_json_file(role_path, self.model_dump())

    def remap_tasks(self):
--- a/expo/run_experiment.py
+++ b/expo/run_experiment.py
@ -31,10 +31,16 @@ def get_mcts_args(parser):
    parser.set_defaults(load_tree=False)
    parser.add_argument("--rollouts", type=int, default=5)
    parser.add_argument("--use_fixed_insights", dest="use_fixed_insights", action="store_true")
+    parser.set_defaults(use_fixed_insights=False)
    parser.add_argument("--start_task_id", type=int, default=2)
    parser.add_argument(
        "--from_scratch", dest="from_scratch", action="store_true", help="Generate solutions from scratch"
    )
+    parser.set_defaults(from_scratch=False)
+    parser.add_argument("--no_external_eval", dest="external_eval", action="store_false")
+    parser.set_defaults(external_eval=True)
+    parser.add_argument("--eval_func", type=str, default="sela", choices=["sela", "mlebench"])
+    parser.add_argument("--custom_dataset_dir", type=str, default=None)


 def get_aug_exp_args(parser):
--- a/expo/utils.py
+++ b/expo/utils.py
@ -51,6 +51,8 @@ def get_exp_pool_path(task_name, data_config, pool_name="analysis_pool"):
            f"Dataset {task_name} not found in config file. Available datasets: {data_config['datasets'].keys()}"
        )
    exp_pool_path = os.path.join(data_path, f"{pool_name}.json")
+    if not os.path.exists(exp_pool_path):
+        return None
    return exp_pool_path