1. add eval_func for sela and compatibility to others

2. llm extract score (use all code blocks and execution results)
3. add argument for custom dataset dir
4. support custom dataset requirements
This commit is contained in:
Yizhou Chi 2024-10-12 17:16:51 +08:00
parent eda9322361
commit 3a57060e25
11 changed files with 202 additions and 38 deletions

View file

@ -3,10 +3,12 @@ import math
import os
import pickle
import random
import shutil
import numpy as np
import pandas as pd
from expo.data.custom_task import get_mle_bench_requirements
from expo.data.dataset import generate_task_requirement, get_split_dataset_path
from expo.evaluation.evaluation import evaluate_score
from expo.insights.instruction_generator import InstructionGenerator
@ -17,9 +19,6 @@ from metagpt.utils.common import read_json_file
def initialize_di_root_node(state, reflection: bool = True):
# state = create_initial_state(
# task, start_task_id=start_task_id, data_config=data_config, low_is_better=low_is_better, name=name
# )
role = ResearchAssistant(
node_id="0", start_task_id=state["start_task_id"], use_reflection=reflection, role_dir=state["node_dir"]
)
@ -29,20 +28,33 @@ def initialize_di_root_node(state, reflection: bool = True):
def create_initial_state(
    task, start_task_id, data_config, low_is_better: bool, name: str, special_instruction: str, args
):
    """Assemble the state dictionary for a run and create its node directory.

    Dataset-specific entries are resolved exactly once, depending on whether a
    custom dataset directory was supplied. Superseded dictionary entries that
    looked up ``data_config["datasets"][task]`` and called the dataset helpers
    unconditionally have been removed, so a custom dataset no longer triggers
    lookups for a task that is absent from the config.

    Args:
        task: Task name; part of the node directory name and, for built-in
            datasets, the key into ``data_config["datasets"]``.
        start_task_id: Stored unchanged under ``"start_task_id"``.
        data_config: Configuration mapping. ``"work_dir"`` and ``"role_dir"``
            are always read; ``"datasets"`` only for built-in datasets.
        low_is_better: Stored unchanged under ``"low_is_better"``.
        name: Suffix appended to ``task`` in the node directory name.
        special_instruction: Forwarded to ``generate_task_requirement``
            (built-in datasets only).
        args: Parsed arguments; ``external_eval``, ``custom_dataset_dir`` and
            ``role_timeout`` are read.

    Returns:
        dict: The initial state. Its ``"node_dir"`` exists on return.
    """
    external_eval = args.external_eval
    if args.custom_dataset_dir:
        # Custom dataset: no config entry and no experience pool; the
        # requirement text is built from the description file in that directory.
        dataset_config = None
        # NOTE(review): this is a plain path string, while Node.get_gt indexes
        # state["datasets_dir"] with a "<split>_target" key -- confirm external
        # evaluation is not expected to work with a custom dataset.
        datasets_dir = args.custom_dataset_dir
        requirement = get_mle_bench_requirements(args.custom_dataset_dir, data_config)
        exp_pool_path = None
    else:
        dataset_config = data_config["datasets"][task]
        datasets_dir = get_split_dataset_path(task, data_config)
        requirement = generate_task_requirement(task, data_config, is_di=True, special_instruction=special_instruction)
        exp_pool_path = get_exp_pool_path(task, data_config, pool_name="ds_analysis_pool")

    initial_state = {
        "task": task,
        "work_dir": data_config["work_dir"],
        "node_dir": os.path.join(data_config["work_dir"], data_config["role_dir"], f"{task}{name}"),
        "dataset_config": dataset_config,
        # Read by Node.get_gt, which is reached only when external_eval is enabled.
        "datasets_dir": datasets_dir,
        "exp_pool_path": exp_pool_path,
        "requirement": requirement,
        "has_run": False,
        "start_task_id": start_task_id,
        "low_is_better": low_is_better,
        "role_timeout": args.role_timeout,
        "external_eval": external_eval,
        "custom_dataset_dir": args.custom_dataset_dir,
    }
    os.makedirs(initial_state["node_dir"], exist_ok=True)
    return initial_state
@ -173,22 +185,34 @@ class Node:
node.save_new_role(new_role)
self.add_child(node)
def get_predictions_path(self, split):
    """Return the path of this node's own predictions CSV for ``split``."""
    return os.path.join(self.state["node_dir"], f"Node-{self.id}-{split}_predictions.csv")

def get_and_move_predictions(self, split):
    """Load the predictions for ``split``, claiming the shared file if needed.

    If this node has no predictions file yet, the shared
    ``<work_dir>/<task>/<split>_predictions.csv`` is moved to the node's own
    path, so the shared location is empty afterwards.

    Returns:
        pandas.DataFrame: Contents of the node's predictions CSV.

    Raises:
        FileNotFoundError: If neither the node file nor the shared file exists.
    """
    node_pred_path = self.get_predictions_path(split)
    if not os.path.exists(node_pred_path):
        pred_path = os.path.join(self.state["work_dir"], self.state["task"], f"{split}_predictions.csv")
        # One move replaces the previous copy-then-delete pair.
        shutil.move(pred_path, node_pred_path)
    return pd.read_csv(node_pred_path)

def get_gt(self, split):
    """Load the ground-truth CSV registered under ``"<split>_target"``.

    Returns:
        pandas.DataFrame: Contents of the ground-truth file.
    """
    # state["datasets_dir"] is indexed by "<split>_target" to get the file path.
    gt_path = self.state["datasets_dir"][f"{split}_target"]
    return pd.read_csv(gt_path)

def evaluate_prediction(self, split):
    """Score this node's ``target`` predictions for ``split`` against the ground truth.

    The metric name is read from ``state["dataset_config"]["metric"]`` and the
    result is whatever ``evaluate_score`` returns for it.
    """
    preds = self.get_and_move_predictions(split)["target"]
    gt = self.get_gt(split)["target"]
    metric = self.state["dataset_config"]["metric"]
    return evaluate_score(preds, gt, metric)
def evaluate_simulation(self, score_dict):
    """Finalize the scores of a simulation and collect its prediction files.

    With ``state["external_eval"]`` enabled, the dev and test predictions are
    scored and ``dev_score``, ``test_score`` and ``score`` (equal to the dev
    score) are written into ``score_dict``. Otherwise the prediction files are
    only moved into the node directory and ``score_dict`` is left untouched.
    Predictions are no longer scored before the flag is checked.

    Args:
        score_dict: Mapping of scores; updated in place when external
            evaluation is enabled.

    Returns:
        The same ``score_dict`` object.
    """
    if self.state["external_eval"]:  # use external evaluation
        scores = {"dev_score": self.evaluate_prediction("dev"), "test_score": self.evaluate_prediction("test")}
        scores["score"] = scores["dev_score"]
        score_dict.update(scores)
    else:
        # Still claim the prediction files so they end up under this node.
        self.get_and_move_predictions("dev")
        self.get_and_move_predictions("test")
    return score_dict
async def run_node(self, role=None):
@ -215,7 +239,6 @@ class Node:
mcts_logger.log("MCTS", f"Role-level timeout: {e}")
break
except Exception as e:
print(f"Error: {e}")
mcts_logger.log("MCTS", f"Error in running the role: {e}")
num_runs += 1

38
expo/data/custom_task.py Normal file
View file

@ -0,0 +1,38 @@
import os
from expo.experimenter.mle_bench.instructions import (
ADDITIONAL_NOTES,
INSTRUCTIONS,
INSTRUCTIONS_OBFUSCATED,
)
# Description file names of an MLE-bench task directory.
# NOTE(review): get_mle_bench_requirements hard-codes the same two names and
# does not read this list.
MLE_BENCH_FILES = ["description.md", "description_obfuscated.md"]
# Template of the requirement text: benchmark instructions first, then the
# additional notes, then the task description. The placeholder is spelled
# `additonal_notes` (sic); the `.format` call must use the same spelling.
MLE_REQUIREMENTS = """
{instructions}
{additonal_notes}
COMPETITION INSTRUCTIONS
------
{task_description}
"""
def get_mle_bench_requirements(dataset_dir, data_config, obfuscated=False):
    """Build the requirement text for a task stored in ``dataset_dir``.

    Args:
        dataset_dir: Directory containing ``description.md`` (or
            ``description_obfuscated.md`` when ``obfuscated`` is true).
        data_config: Accepted for call compatibility; not read here.
        obfuscated: Select the obfuscated instructions and description file.

    Returns:
        str: ``MLE_REQUIREMENTS`` filled with the instructions, the additional
        notes and the description file's content.

    Raises:
        FileNotFoundError: If the selected description file does not exist.
    """
    if obfuscated:
        instructions, task_file = INSTRUCTIONS_OBFUSCATED, "description_obfuscated.md"
    else:
        instructions, task_file = INSTRUCTIONS, "description.md"

    # Decode explicitly as UTF-8 so the markdown reads the same on every
    # platform instead of depending on the locale's default encoding.
    with open(os.path.join(dataset_dir, task_file), encoding="utf-8") as f:
        task_description = f.read()

    return MLE_REQUIREMENTS.format(
        instructions=instructions, additonal_notes=ADDITIONAL_NOTES, task_description=task_description
    )

View file

@ -268,7 +268,7 @@ class ExpDataset:
dataset_info = self.get_dataset_info()
num_classes = dataset_info["metadata"]["NumberOfClasses"]
if num_classes == 2:
metric = "f1"
metric = "f1 binary"
elif 2 < num_classes <= 200:
metric = "f1 weighted"
elif num_classes > 200 or num_classes == 0:

View file

@ -22,3 +22,15 @@ def evaluate_score(pred, gt, metric):
return mean_squared_error(np.log1p(gt), np.log1p(pred), squared=False)
else:
raise ValueError(f"Metric {metric} not supported")
def node_evaluate_score_sela(node):
    """Score a node's test-split ``target`` predictions with the dataset's metric.

    The predictions come from ``node.get_and_move_predictions("test")``, the
    ground truth from ``node.get_gt("test")``, and the metric name from
    ``node.state["dataset_config"]["metric"]``.
    """
    column = "target"
    test_predictions = node.get_and_move_predictions("test")[column]
    test_ground_truth = node.get_gt("test")[column]
    return evaluate_score(test_predictions, test_ground_truth, node.state["dataset_config"]["metric"])
def node_evaluate_score_mlebench(node):
    """Placeholder MLE-bench grader: ignores ``node`` and always returns 0."""
    # TODO
    placeholder_score = 0
    return placeholder_score

View file

@ -43,7 +43,10 @@ class Experimenter:
except Exception as e:
print(f"Error: {e}")
num_runs += 1
save_notebook(role=di, save_dir=self.result_path, name=f"{self.args.task}_{self.start_time}_{run_idx}")
# save_notebook(role=di, save_dir=self.result_path, name=f"{self.args.task}_{self.start_time}_{run_idx}")
save_name = self.get_save_name()
save_notebook(role=di, save_dir=self.result_path, name=f"{save_name}_{run_idx}")
if not run_finished:
score_dict = {"train_score": -1, "dev_score": -1, "test_score": -1, "score": -1}
return score_dict

View file

@ -1,5 +1,9 @@
import shutil
from expo.evaluation.evaluation import (
node_evaluate_score_mlebench,
node_evaluate_score_sela,
)
from expo.evaluation.visualize_mcts import get_tree_text
from expo.experimenter.experimenter import Experimenter
from expo.Greedy import Greedy, Random
@ -14,25 +18,35 @@ class MCTSExperimenter(Experimenter):
self.start_task_id = 1 # start from datapreprocessing if it is image task
else:
self.start_task_id = args.start_task_id
if args.eval_func == "sela":
self.eval_func = node_evaluate_score_sela
elif args.eval_func == "mlebench":
self.eval_func = node_evaluate_score_mlebench
super().__init__(args, **kwargs)
self.tree_mode = tree_mode
async def run_experiment(self):
use_fixed_insights = self.args.use_fixed_insights
depth = 5
if self.tree_mode == "greedy":
mcts = Greedy(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights)
mcts = Greedy(root_node=None, max_depth=depth, use_fixed_insights=use_fixed_insights)
elif self.tree_mode == "random":
mcts = Random(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights)
mcts = Random(root_node=None, max_depth=depth, use_fixed_insights=use_fixed_insights)
else:
mcts = MCTS(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights)
mcts = MCTS(root_node=None, max_depth=depth, use_fixed_insights=use_fixed_insights)
best_nodes = await mcts.search(state=self.state, args=self.args)
best_node = best_nodes["global_best"]
dev_best_node = best_nodes["dev_best"]
score_dict = best_nodes["scores"]
additional_scores = {"grader": self.eval_func(dev_best_node)}
text, num_generated_codes = get_tree_text(mcts.root_node)
text += f"Generated {num_generated_codes} unique codes.\n"
text += f"Best node: {best_node.id}, score: {best_node.raw_reward}\n"
text += f"Dev best node: {dev_best_node.id}, score: {dev_best_node.raw_reward}\n"
text += f"Grader score: {additional_scores['grader']}\n"
print(text)
results = [
{
@ -45,6 +59,7 @@ class MCTSExperimenter(Experimenter):
"tree_text": text,
"args": vars(self.args),
"scores": score_dict,
"additional_scores": additional_scores,
}
]
self.save_result(results)

View file

@ -0,0 +1,47 @@
# Benchmark instructions for the regular (non-obfuscated) MLE-bench setup.
# This is prompt text: it is inserted verbatim into the requirement template,
# so any wording change alters what the agent is told.
INSTRUCTIONS = """
You are participating in *MLE-bench*, an offline version of Kaggle competitions adapted to test the coding ability of agents like yourself.
You will be given a machine learning task. You must solve the task by training a model and running the model on the test set to produce a submission file. You may not hand-label the data to produce a submission file.
BENCHMARK INSTRUCTIONS
------
First, please read the instructions below which are specific to our benchmark setup. Note, these instructions can be found again in `/home/instructions.txt`.
- **Competition-specific instructions**: The instructions for the competition are available in `/home/data/description.md`. Read this file to understand the problem setup for the competition you are currently solving.
- **Dataset**: The dataset files for the competition are available in `/home/data/`.
- **Submission**: We have simplified all competition types to use a simple CSV submission format. Regardless of what the competition instructions say, you MUST produce a submission file at `/home/submission/submission.csv`. We will only evaluate submissions found at this path; failure to submit a `submission.csv` at this path will result in a failed attempt.
- **Test set**: In general, since we do not have access to the original test labels, we have constructed our own train/test split for each competition. As such, the dataset you see in `/home/data/` may differ slightly from what is described in the competition instructions. Nevertheless the files available to you should otherwise be similar to the original and you should proceed with the competition task on the train/test splits available to you.
- **Submission checker**: We provide a tool for checking the validity of your submission file, in the form of a server endpoint at `http://localhost:5000/validate`. If you post a submission file to the server (e.g. `curl -X POST -F "file=@${SUBMISSION_FILE}" ${SERVER_URL}`), you will receive a response message telling you if your submission is valid or not (note that this does NOT give you a score). For your convenience, we provide a script `/home/validate_submission.sh` which can help you make this submission to the validation server.
- **Cheating**: You are not allowed to view others' solutions to this competition. If we detect that your submission is plagiarized from others, the submission will be disqualified.
Your submission will be graded based on the metric in the competition-specific instructions. Your goal is to achieve the best score possible on the competition task.
Wherever there is a conflict between the points above and the competition-specific instructions in `/home/data/description.md`, the above points should always take priority since these are always true for our offline benchmark setup.
"""
# Variant of the benchmark instructions that speaks of a generic "task"
# instead of a Kaggle competition. This is prompt text and is inserted
# verbatim into the requirement template.
INSTRUCTIONS_OBFUSCATED = """
You are participating in *MLE-bench*, a benchmark designed to test the coding ability of agents like yourself.
You will be given a machine learning task. You must solve the task by training a model and running the model on the test set to produce a submission file. You may not hand-label the data to produce a submission file.
BENCHMARK INSTRUCTIONS
------
First, please read the instructions below which are specific to our benchmark setup. Note, these instructions can be found again in `/home/instructions.txt`.
- **Task-specific instructions**: The instructions for the task are available in `/home/data/description.md`. Read this file to understand the problem setup for the task you are currently solving.
- **Dataset**: The dataset files for the task are available in `/home/data/`.
- **Submission**: Regardless of what the task instructions say, you MUST produce a submission file at `/home/submission/submission.csv`. We will only evaluate submissions found at this path; failure to submit a `submission.csv` at this path will result in a failed attempt.
- **Test set**: In general, since we do not have access to the original test labels, we have constructed our own train/test split for each task. As such, the dataset you see in `/home/data/` may differ slightly from what is described in the task instructions. Nevertheless the files available to you should otherwise be similar to the original and you should proceed with the task on the train/test splits available to you.
- **Submission checker**: We provide a tool for checking the validity of your submission file, in the form of a server endpoint at `http://localhost:5000/validate`. If you post a submission file to the server (e.g. `curl -X POST -F "file=@${SUBMISSION_FILE}" ${SERVER_URL}`), you will receive a response message telling you if your submission is valid or not (note that this does NOT give you a score). For your convenience, we provide a script `/home/validate_submission.sh` which can help you make this submission to the validation server.
- **Cheating**: You are not allowed to view others' solutions to this task. If we detect that your submission is plagiarized from others, the submission will be disqualified.
Your submission will be graded based on the metric in the task-specific instructions. Your goal is to achieve the best score possible on the task.
Wherever there is a conflict between the points above and the task-specific instructions in `/home/data/description.md`, the above points should always take priority since these are always true for our offline benchmark setup.
"""
# Extra notes section of the prompt (currently only the compute statement).
# This is prompt text and is inserted verbatim into the requirement template.
ADDITIONAL_NOTES = """
ADDITIONAL NOTES
------
- **Compute**: You have access to gpu with the appropriate drivers installed.
"""

View file

@ -37,12 +37,18 @@ class InstructionGenerator:
def __init__(self, state, use_fixed_insights, from_scratch):
    """Set up the generator from the run state.

    ``dataset_info.json`` is read only for built-in datasets. The earlier
    unconditional load, which ran before the custom-dataset check, has been
    removed together with the ``dataset_info_path`` attribute it set.

    Args:
        state: Run state; ``"exp_pool_path"``, ``"custom_dataset_dir"`` and,
            for built-in datasets, ``"task"`` are read.
        use_fixed_insights: Stored unchanged on the instance.
        from_scratch: Requested mode; forced to ``True`` when the state has no
            experience pool path.
    """
    self.state = state
    self.file_path = state["exp_pool_path"]
    if state["custom_dataset_dir"]:
        # NOTE(review): "xxx" is a placeholder -- a custom dataset has no
        # dataset_info.json here; confirm what consumers of dataset_info expect.
        self.dataset_info = "xxx"
    else:
        dataset_info_path = f"{self.data_config['datasets_dir']}/{state['task']}/dataset_info.json"
        with open(dataset_info_path, "r") as file:
            self.dataset_info = json.load(file)
    self.use_fixed_insights = use_fixed_insights
    self.proposer = SolutionDesigner()
    # Without an experience pool there is nothing to load, so always start from scratch.
    self.from_scratch = True if self.file_path is None else from_scratch
async def initialize(self):
if self.from_scratch:

View file

@ -13,15 +13,19 @@ from metagpt.roles.di.data_interpreter import DataInterpreter
from metagpt.schema import Message, Task, TaskResult
from metagpt.utils.common import CodeParser, write_json_file
EXTRACT_SCORE_PROMPT = """
# Code:
CODE_BLOCK_RESULT = """
## Code:
{code}
# Execution Result:
## Execution Result:
{result}
"""
EXTRACT_SCORE_PROMPT = """
# Code Blocks
{code_block}
# Instruction:
Based on the code and execution result, please extract the scores and return it as a dictionary.
Based on the code and execution result, please extract the **final scores** and return it as a dictionary.
If you cannot find the scores, please still return a dictionary with the keys 'train_score', 'dev_score', and 'test_score', and set the values to -1.
# Format:
@ -109,9 +113,17 @@ class ResearchAssistant(DataInterpreter):
return score_dict
async def llm_extract_score(self):
    """Ask the LLM to extract the final scores from every task's code and output.

    The code and execution result of each planned task are rendered with
    ``CODE_BLOCK_RESULT``, joined, and placed into ``EXTRACT_SCORE_PROMPT``.
    The earlier single-task prompt, which used the old ``code``/``result``
    placeholders and issued an extra LLM call, has been removed.

    Returns:
        dict: The JSON object parsed from the LLM response.

    Raises:
        json.JSONDecodeError: If the extracted block is not valid JSON.
    """
    task_map = self.planner.plan.task_map
    # Tasks are looked up by the string ids "1".."N" (assumed to be how the
    # plan keys them), walked numerically to keep the blocks in plan order.
    code_block = "\n".join(
        CODE_BLOCK_RESULT.format(code=task_map[str(task_id)].code, result=task_map[str(task_id)].result)
        for task_id in range(1, len(task_map) + 1)
    )
    # The former role="user" keyword was an argument to str.format, which
    # ignores unused keywords, so dropping it leaves the prompt unchanged.
    rsp = await self.llm.aask(EXTRACT_SCORE_PROMPT.format(code_block=code_block))
    json_block = CodeParser.parse_code(block=None, text=rsp)
    return json.loads(json_block)
@ -161,7 +173,7 @@ class ResearchAssistant(DataInterpreter):
stg_path = self.role_dir
name = self.get_node_name()
role_path = os.path.join(stg_path, f"{name}.json")
# 将状态保存为 JSON 文件
# save state as json file
write_json_file(role_path, self.model_dump())
def remap_tasks(self):

View file

@ -31,10 +31,16 @@ def get_mcts_args(parser):
parser.set_defaults(load_tree=False)
parser.add_argument("--rollouts", type=int, default=5)
parser.add_argument("--use_fixed_insights", dest="use_fixed_insights", action="store_true")
parser.set_defaults(use_fixed_insights=False)
parser.add_argument("--start_task_id", type=int, default=2)
parser.add_argument(
"--from_scratch", dest="from_scratch", action="store_true", help="Generate solutions from scratch"
)
parser.set_defaults(from_scratch=False)
parser.add_argument("--no_external_eval", dest="external_eval", action="store_false")
parser.set_defaults(external_eval=True)
parser.add_argument("--eval_func", type=str, default="sela", choices=["sela", "mlebench"])
parser.add_argument("--custom_dataset_dir", type=str, default=None)
def get_aug_exp_args(parser):

View file

@ -51,6 +51,8 @@ def get_exp_pool_path(task_name, data_config, pool_name="analysis_pool"):
f"Dataset {task_name} not found in config file. Available datasets: {data_config['datasets'].keys()}"
)
exp_pool_path = os.path.join(data_path, f"{pool_name}.json")
if not os.path.exists(exp_pool_path):
return None
return exp_pool_path