mirror of
https://github.com/FoundationAgents/MetaGPT.git
synced 2026-05-30 14:35:17 +02:00
1. add eval_func for sela and compatibility with others
2. llm extract score (use all code blocks and execution results) 3. add argument for custom dataset dir 4. dataset custom requirement support
This commit is contained in:
parent
eda9322361
commit
3a57060e25
11 changed files with 202 additions and 38 deletions
65
expo/MCTS.py
65
expo/MCTS.py
|
|
@ -3,10 +3,12 @@ import math
|
|||
import os
|
||||
import pickle
|
||||
import random
|
||||
import shutil
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from expo.data.custom_task import get_mle_bench_requirements
|
||||
from expo.data.dataset import generate_task_requirement, get_split_dataset_path
|
||||
from expo.evaluation.evaluation import evaluate_score
|
||||
from expo.insights.instruction_generator import InstructionGenerator
|
||||
|
|
@ -17,9 +19,6 @@ from metagpt.utils.common import read_json_file
|
|||
|
||||
|
||||
def initialize_di_root_node(state, reflection: bool = True):
|
||||
# state = create_initial_state(
|
||||
# task, start_task_id=start_task_id, data_config=data_config, low_is_better=low_is_better, name=name
|
||||
# )
|
||||
role = ResearchAssistant(
|
||||
node_id="0", start_task_id=state["start_task_id"], use_reflection=reflection, role_dir=state["node_dir"]
|
||||
)
|
||||
|
|
@ -29,20 +28,33 @@ def initialize_di_root_node(state, reflection: bool = True):
|
|||
def create_initial_state(
|
||||
task, start_task_id, data_config, low_is_better: bool, name: str, special_instruction: str, args
|
||||
):
|
||||
external_eval = args.external_eval
|
||||
|
||||
if args.custom_dataset_dir:
|
||||
dataset_config = None
|
||||
datasets_dir = args.custom_dataset_dir
|
||||
requirement = get_mle_bench_requirements(args.custom_dataset_dir, data_config)
|
||||
exp_pool_path = None
|
||||
else:
|
||||
dataset_config = data_config["datasets"][task]
|
||||
datasets_dir = get_split_dataset_path(task, data_config)
|
||||
requirement = generate_task_requirement(task, data_config, is_di=True, special_instruction=special_instruction)
|
||||
exp_pool_path = get_exp_pool_path(task, data_config, pool_name="ds_analysis_pool")
|
||||
|
||||
initial_state = {
|
||||
"task": task,
|
||||
"work_dir": data_config["work_dir"],
|
||||
"node_dir": os.path.join(data_config["work_dir"], data_config["role_dir"], f"{task}{name}"),
|
||||
"dataset_config": data_config["datasets"][task],
|
||||
"datasets_dir": get_split_dataset_path(task, data_config),
|
||||
"exp_pool_path": get_exp_pool_path(task, data_config, pool_name="ds_analysis_pool"),
|
||||
"requirement": generate_task_requirement(
|
||||
task, data_config, is_di=True, special_instruction=special_instruction
|
||||
),
|
||||
"dataset_config": dataset_config,
|
||||
"datasets_dir": datasets_dir, # won't be used if external eval is used
|
||||
"exp_pool_path": exp_pool_path,
|
||||
"requirement": requirement,
|
||||
"has_run": False,
|
||||
"start_task_id": start_task_id,
|
||||
"low_is_better": low_is_better,
|
||||
"role_timeout": args.role_timeout,
|
||||
"external_eval": external_eval,
|
||||
"custom_dataset_dir": args.custom_dataset_dir,
|
||||
}
|
||||
os.makedirs(initial_state["node_dir"], exist_ok=True)
|
||||
return initial_state
|
||||
|
|
@ -173,22 +185,34 @@ class Node:
|
|||
node.save_new_role(new_role)
|
||||
self.add_child(node)
|
||||
|
||||
def evaluate_prediction(self, split):
|
||||
pred_path = os.path.join(self.state["work_dir"], self.state["task"], f"{split}_predictions.csv")
|
||||
pred_node_path = os.path.join(self.state["node_dir"], f"Node-{self.id}-{split}_predictions.csv")
|
||||
def get_predictions_path(self, split):
    """Return the path of this node's own predictions CSV for ``split``.

    The file lives in ``self.state["node_dir"]`` and is named
    ``Node-<node id>-<split>_predictions.csv``.
    """
    node_dir = self.state["node_dir"]
    file_name = f"Node-{self.id}-{split}_predictions.csv"
    return os.path.join(node_dir, file_name)
|
||||
|
||||
def get_and_move_predictions(self, split):
    """Load this node's predictions for ``split`` as a DataFrame.

    If the node does not yet have its own predictions file, the shared
    ``<work_dir>/<task>/<split>_predictions.csv`` is copied to the node's
    path and the shared file is then deleted. The node's file is read with
    ``pd.read_csv`` and returned.
    """
    node_pred_path = self.get_predictions_path(split)
    if not os.path.exists(node_pred_path):
        shared_pred_path = os.path.join(
            self.state["work_dir"], self.state["task"], f"{split}_predictions.csv"
        )
        # Copy first, then delete the shared file so the next run starts clean.
        shutil.copy(shared_pred_path, node_pred_path)
        os.remove(shared_pred_path)
    return pd.read_csv(node_pred_path)
|
||||
|
||||
def get_gt(self, split):
|
||||
gt_path = os.path.join(self.state["datasets_dir"][f"{split}_target"])
|
||||
preds = pd.read_csv(pred_path)["target"]
|
||||
preds.to_csv(pred_node_path, index=False)
|
||||
gt = pd.read_csv(gt_path)["target"]
|
||||
return pd.read_csv(gt_path)
|
||||
|
||||
def evaluate_prediction(self, split):
|
||||
preds = self.get_and_move_predictions(split)["target"]
|
||||
gt = self.get_gt(split)["target"]
|
||||
metric = self.state["dataset_config"]["metric"]
|
||||
# remove original predictions.csv
|
||||
os.remove(pred_path)
|
||||
return evaluate_score(preds, gt, metric)
|
||||
|
||||
def evaluate_simulation(self, score_dict):
|
||||
scores = {"dev_score": self.evaluate_prediction("dev"), "test_score": self.evaluate_prediction("test")}
|
||||
scores["score"] = scores["dev_score"]
|
||||
score_dict.update(scores)
|
||||
if self.state["external_eval"]: # use external evaluation
|
||||
scores = {"dev_score": self.evaluate_prediction("dev"), "test_score": self.evaluate_prediction("test")}
|
||||
scores["score"] = scores["dev_score"]
|
||||
score_dict.update(scores)
|
||||
else:
|
||||
self.get_and_move_predictions("dev")
|
||||
self.get_and_move_predictions("test")
|
||||
return score_dict
|
||||
|
||||
async def run_node(self, role=None):
|
||||
|
|
@ -215,7 +239,6 @@ class Node:
|
|||
mcts_logger.log("MCTS", f"Role-level timeout: {e}")
|
||||
break
|
||||
except Exception as e:
|
||||
print(f"Error: {e}")
|
||||
mcts_logger.log("MCTS", f"Error in running the role: {e}")
|
||||
num_runs += 1
|
||||
|
||||
|
|
|
|||
38
expo/data/custom_task.py
Normal file
38
expo/data/custom_task.py
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
import os
|
||||
|
||||
from expo.experimenter.mle_bench.instructions import (
|
||||
ADDITIONAL_NOTES,
|
||||
INSTRUCTIONS,
|
||||
INSTRUCTIONS_OBFUSCATED,
|
||||
)
|
||||
|
||||
MLE_BENCH_FILES = ["description.md", "description_obfuscated.md"]
|
||||
|
||||
|
||||
MLE_REQUIREMENTS = """
|
||||
{instructions}
|
||||
|
||||
{additonal_notes}
|
||||
|
||||
COMPETITION INSTRUCTIONS
|
||||
------
|
||||
|
||||
{task_description}
|
||||
|
||||
"""
|
||||
|
||||
|
||||
def get_mle_bench_requirements(dataset_dir, data_config, obfuscated=False):
    """Build the MLE-bench requirement prompt for a custom dataset directory.

    Args:
        dataset_dir: Directory containing the task description markdown file
            (``description.md`` or ``description_obfuscated.md``).
        data_config: Currently unused by this function; kept so existing
            callers that pass it keep working.
        obfuscated: When true, use ``INSTRUCTIONS_OBFUSCATED`` and read
            ``description_obfuscated.md``; otherwise use ``INSTRUCTIONS`` and
            read ``description.md``.

    Returns:
        ``MLE_REQUIREMENTS`` formatted with the selected instructions,
        ``ADDITIONAL_NOTES`` and the text of the description file.

    Raises:
        OSError: If the description file cannot be opened.
    """
    if obfuscated:
        instructions, task_file = INSTRUCTIONS_OBFUSCATED, "description_obfuscated.md"
    else:
        instructions, task_file = INSTRUCTIONS, "description.md"

    # Read the markdown explicitly as UTF-8 instead of relying on the
    # platform's default encoding, which differs between systems.
    with open(os.path.join(dataset_dir, task_file), encoding="utf-8") as f:
        task_description = f.read()

    # NOTE: the keyword "additonal_notes" (sic) must match the placeholder
    # name used inside the MLE_REQUIREMENTS template.
    return MLE_REQUIREMENTS.format(
        instructions=instructions,
        additonal_notes=ADDITIONAL_NOTES,
        task_description=task_description,
    )
|
||||
|
|
@ -268,7 +268,7 @@ class ExpDataset:
|
|||
dataset_info = self.get_dataset_info()
|
||||
num_classes = dataset_info["metadata"]["NumberOfClasses"]
|
||||
if num_classes == 2:
|
||||
metric = "f1"
|
||||
metric = "f1 binary"
|
||||
elif 2 < num_classes <= 200:
|
||||
metric = "f1 weighted"
|
||||
elif num_classes > 200 or num_classes == 0:
|
||||
|
|
|
|||
|
|
@ -22,3 +22,15 @@ def evaluate_score(pred, gt, metric):
|
|||
return mean_squared_error(np.log1p(gt), np.log1p(pred), squared=False)
|
||||
else:
|
||||
raise ValueError(f"Metric {metric} not supported")
|
||||
|
||||
|
||||
def node_evaluate_score_sela(node):
    """Score a node's test-split predictions against the ground truth.

    Reads the ``"target"`` column of the node's test predictions and of its
    test ground truth, then scores them with ``evaluate_score`` using the
    metric stored in ``node.state["dataset_config"]["metric"]``.
    """
    split = "test"
    predicted = node.get_and_move_predictions(split)["target"]
    expected = node.get_gt(split)["target"]
    return evaluate_score(predicted, expected, node.state["dataset_config"]["metric"])
|
||||
|
||||
|
||||
def node_evaluate_score_mlebench(node):
    """Placeholder grader for MLE-bench runs.

    Grading is not implemented yet: ``node`` is ignored and a constant
    score of 0 is returned.
    """
    # TODO: implement MLE-bench grading.
    placeholder_score = 0
    return placeholder_score
|
||||
|
|
|
|||
|
|
@ -43,7 +43,10 @@ class Experimenter:
|
|||
except Exception as e:
|
||||
print(f"Error: {e}")
|
||||
num_runs += 1
|
||||
save_notebook(role=di, save_dir=self.result_path, name=f"{self.args.task}_{self.start_time}_{run_idx}")
|
||||
# save_notebook(role=di, save_dir=self.result_path, name=f"{self.args.task}_{self.start_time}_{run_idx}")
|
||||
save_name = self.get_save_name()
|
||||
save_notebook(role=di, save_dir=self.result_path, name=f"{save_name}_{run_idx}")
|
||||
|
||||
if not run_finished:
|
||||
score_dict = {"train_score": -1, "dev_score": -1, "test_score": -1, "score": -1}
|
||||
return score_dict
|
||||
|
|
|
|||
|
|
@ -1,5 +1,9 @@
|
|||
import shutil
|
||||
|
||||
from expo.evaluation.evaluation import (
|
||||
node_evaluate_score_mlebench,
|
||||
node_evaluate_score_sela,
|
||||
)
|
||||
from expo.evaluation.visualize_mcts import get_tree_text
|
||||
from expo.experimenter.experimenter import Experimenter
|
||||
from expo.Greedy import Greedy, Random
|
||||
|
|
@ -14,25 +18,35 @@ class MCTSExperimenter(Experimenter):
|
|||
self.start_task_id = 1 # start from datapreprocessing if it is image task
|
||||
else:
|
||||
self.start_task_id = args.start_task_id
|
||||
|
||||
if args.eval_func == "sela":
|
||||
self.eval_func = node_evaluate_score_sela
|
||||
elif args.eval_func == "mlebench":
|
||||
self.eval_func = node_evaluate_score_mlebench
|
||||
|
||||
super().__init__(args, **kwargs)
|
||||
self.tree_mode = tree_mode
|
||||
|
||||
async def run_experiment(self):
|
||||
use_fixed_insights = self.args.use_fixed_insights
|
||||
depth = 5
|
||||
if self.tree_mode == "greedy":
|
||||
mcts = Greedy(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights)
|
||||
mcts = Greedy(root_node=None, max_depth=depth, use_fixed_insights=use_fixed_insights)
|
||||
elif self.tree_mode == "random":
|
||||
mcts = Random(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights)
|
||||
mcts = Random(root_node=None, max_depth=depth, use_fixed_insights=use_fixed_insights)
|
||||
else:
|
||||
mcts = MCTS(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights)
|
||||
mcts = MCTS(root_node=None, max_depth=depth, use_fixed_insights=use_fixed_insights)
|
||||
best_nodes = await mcts.search(state=self.state, args=self.args)
|
||||
best_node = best_nodes["global_best"]
|
||||
dev_best_node = best_nodes["dev_best"]
|
||||
score_dict = best_nodes["scores"]
|
||||
additional_scores = {"grader": self.eval_func(dev_best_node)}
|
||||
|
||||
text, num_generated_codes = get_tree_text(mcts.root_node)
|
||||
text += f"Generated {num_generated_codes} unique codes.\n"
|
||||
text += f"Best node: {best_node.id}, score: {best_node.raw_reward}\n"
|
||||
text += f"Dev best node: {dev_best_node.id}, score: {dev_best_node.raw_reward}\n"
|
||||
text += f"Grader score: {additional_scores['grader']}\n"
|
||||
print(text)
|
||||
results = [
|
||||
{
|
||||
|
|
@ -45,6 +59,7 @@ class MCTSExperimenter(Experimenter):
|
|||
"tree_text": text,
|
||||
"args": vars(self.args),
|
||||
"scores": score_dict,
|
||||
"additional_scores": additional_scores,
|
||||
}
|
||||
]
|
||||
self.save_result(results)
|
||||
|
|
|
|||
47
expo/experimenter/mle_bench/instructions.py
Normal file
47
expo/experimenter/mle_bench/instructions.py
Normal file
|
|
@ -0,0 +1,47 @@
|
|||
INSTRUCTIONS = """
|
||||
You are participating in *MLE-bench*, an offline version of Kaggle competitions adapted to test the coding ability of agents like yourself.
|
||||
|
||||
You will be given a machine learning task. You must solve the task by training a model and running the model on the test set to produce a submission file. You may not hand-label the data to produce a submission file.
|
||||
|
||||
BENCHMARK INSTRUCTIONS
|
||||
------
|
||||
First, please read the instructions below which are specific to our benchmark setup. Note, these instructions can be found again in `/home/instructions.txt`.
|
||||
|
||||
- **Competition-specific instructions**: The instructions for the competition are available in `/home/data/description.md`. Read this file to understand the problem setup for the competition you are currently solving.
|
||||
- **Dataset**: The dataset files for the competition are available in `/home/data/`.
|
||||
- **Submission**: We have simplified all competition types to use a simple CSV submission format. Regardless of what the competition instructions say, you MUST produce a submission file at `/home/submission/submission.csv`. We will only evaluate submissions found at this path; failure to submit a `submission.csv` at this path will result in a failed attempt.
|
||||
- **Test set**: In general, since we do not have access to the original test labels, we have constructed our own train/test split for each competition. As such, the dataset you see in `/home/data/` may differ slightly from what is described in the competition instructions. Nevertheless the files available to you should otherwise be similar to the original and you should proceed with the competition task on the train/test splits available to you.
|
||||
- **Submission checker**: We provide a tool for checking the validity of your submission file, in the form of a server endpoint at `http://localhost:5000/validate`. If you post a submission file to the server (e.g. `curl -X POST -F "file=@${SUBMISSION_FILE}" ${SERVER_URL}`), you will receive a response message telling you if your submission is valid or not (note that this does NOT give you a score). For your convenience, we provide a script `/home/validate_submission.sh` which can help you make this submission to the validation server.
|
||||
- **Cheating**: You are not allowed to view others' solutions to this competition. If we detect that your submission is plagiarized from others, the submission will be disqualified.
|
||||
|
||||
Your submission will be graded based on the metric in the competition-specific instructions. Your goal is to achieve the best score possible on the competition task.
|
||||
|
||||
Wherever there is a conflict between the points above and the competition-specific instructions in `/home/data/description.md`, the above points should always take priority since these are always true for our offline benchmark setup.
|
||||
"""
|
||||
|
||||
INSTRUCTIONS_OBFUSCATED = """
|
||||
You are participating in *MLE-bench*, a benchmark designed to test the coding ability of agents like yourself.
|
||||
|
||||
You will be given a machine learning task. You must solve the task by training a model and running the model on the test set to produce a submission file. You may not hand-label the data to produce a submission file.
|
||||
|
||||
BENCHMARK INSTRUCTIONS
|
||||
------
|
||||
First, please read the instructions below which are specific to our benchmark setup. Note, these instructions can be found again in `/home/instructions.txt`.
|
||||
|
||||
- **Task-specific instructions**: The instructions for the task are available in `/home/data/description.md`. Read this file to understand the problem setup for the task you are currently solving.
|
||||
- **Dataset**: The dataset files for the task are available in `/home/data/`.
|
||||
- **Submission**: Regardless of what the task instructions say, you MUST produce a submission file at `/home/submission/submission.csv`. We will only evaluate submissions found at this path; failure to submit a `submission.csv` at this path will result in a failed attempt.
|
||||
- **Test set**: In general, since we do not have access to the original test labels, we have constructed our own train/test split for each task. As such, the dataset you see in `/home/data/` may differ slightly from what is described in the task instructions. Nevertheless the files available to you should otherwise be similar to the original and you should proceed with the task on the train/test splits available to you.
|
||||
- **Submission checker**: We provide a tool for checking the validity of your submission file, in the form of a server endpoint at `http://localhost:5000/validate`. If you post a submission file to the server (e.g. `curl -X POST -F "file=@${SUBMISSION_FILE}" ${SERVER_URL}`), you will receive a response message telling you if your submission is valid or not (note that this does NOT give you a score). For your convenience, we provide a script `/home/validate_submission.sh` which can help you make this submission to the validation server.
|
||||
- **Cheating**: You are not allowed to view others' solutions to this task. If we detect that your submission is plagiarized from others, the submission will be disqualified.
|
||||
|
||||
Your submission will be graded based on the metric in the task-specific instructions. Your goal is to achieve the best score possible on the task.
|
||||
|
||||
Wherever there is a conflict between the points above and the task-specific instructions in `/home/data/description.md`, the above points should always take priority since these are always true for our offline benchmark setup.
|
||||
"""
|
||||
|
||||
ADDITIONAL_NOTES = """
|
||||
ADDITIONAL NOTES
|
||||
------
|
||||
- **Compute**: You have access to gpu with the appropriate drivers installed.
|
||||
"""
|
||||
|
|
@ -37,12 +37,18 @@ class InstructionGenerator:
|
|||
def __init__(self, state, use_fixed_insights, from_scratch):
|
||||
self.state = state
|
||||
self.file_path = state["exp_pool_path"]
|
||||
self.dataset_info_path = f"{self.data_config['datasets_dir']}/{state['task']}/dataset_info.json"
|
||||
with open(self.dataset_info_path, "r") as file:
|
||||
self.dataset_info = json.load(file)
|
||||
if state["custom_dataset_dir"]:
|
||||
self.dataset_info = "xxx"
|
||||
else:
|
||||
dataset_info_path = f"{self.data_config['datasets_dir']}/{state['task']}/dataset_info.json"
|
||||
with open(dataset_info_path, "r") as file:
|
||||
self.dataset_info = json.load(file)
|
||||
self.use_fixed_insights = use_fixed_insights
|
||||
self.proposer = SolutionDesigner()
|
||||
self.from_scratch = from_scratch
|
||||
if self.file_path is None:
|
||||
self.from_scratch = True
|
||||
else:
|
||||
self.from_scratch = from_scratch
|
||||
|
||||
async def initialize(self):
|
||||
if self.from_scratch:
|
||||
|
|
|
|||
|
|
@ -13,15 +13,19 @@ from metagpt.roles.di.data_interpreter import DataInterpreter
|
|||
from metagpt.schema import Message, Task, TaskResult
|
||||
from metagpt.utils.common import CodeParser, write_json_file
|
||||
|
||||
EXTRACT_SCORE_PROMPT = """
|
||||
# Code:
|
||||
CODE_BLOCK_RESULT = """
|
||||
## Code:
|
||||
{code}
|
||||
|
||||
# Execution Result:
|
||||
## Execution Result:
|
||||
{result}
|
||||
"""
|
||||
|
||||
EXTRACT_SCORE_PROMPT = """
|
||||
# Code Blocks
|
||||
{code_block}
|
||||
# Instruction:
|
||||
Based on the code and execution result, please extract the scores and return it as a dictionary.
|
||||
Based on the code and execution result, please extract the **final scores** and return it as a dictionary.
|
||||
If you cannot find the scores, please still return a dictionary with the keys 'train_score', 'dev_score', and 'test_score', and set the values to -1.
|
||||
|
||||
# Format:
|
||||
|
|
@ -109,9 +113,17 @@ class ResearchAssistant(DataInterpreter):
|
|||
return score_dict
|
||||
|
||||
async def llm_extract_score(self):
|
||||
result_text = self.planner.plan.task_map[str(len(self.planner.plan.task_map))].result
|
||||
code_text = self.planner.plan.task_map[str(len(self.planner.plan.task_map))].code
|
||||
rsp = await self.llm.aask(EXTRACT_SCORE_PROMPT.format(code=code_text, result=result_text, role="user"))
|
||||
# result_text = self.planner.plan.task_map[str(len(self.planner.plan.task_map))].result
|
||||
# code_text = self.planner.plan.task_map[str(len(self.planner.plan.task_map))].code
|
||||
num_tasks = len(self.planner.plan.task_map)
|
||||
task_map = self.planner.plan.task_map
|
||||
code_block = "\n".join(
|
||||
[
|
||||
CODE_BLOCK_RESULT.format(code=task_map[str(i + 1)].code, result=task_map[str(i + 1)].result)
|
||||
for i in range(num_tasks)
|
||||
]
|
||||
)
|
||||
rsp = await self.llm.aask(EXTRACT_SCORE_PROMPT.format(code_block=code_block, role="user"))
|
||||
json_block = CodeParser.parse_code(block=None, text=rsp)
|
||||
score_dict = json.loads(json_block)
|
||||
return score_dict
|
||||
|
|
@ -161,7 +173,7 @@ class ResearchAssistant(DataInterpreter):
|
|||
stg_path = self.role_dir
|
||||
name = self.get_node_name()
|
||||
role_path = os.path.join(stg_path, f"{name}.json")
|
||||
# 将状态保存为 JSON 文件
|
||||
# save state as json file
|
||||
write_json_file(role_path, self.model_dump())
|
||||
|
||||
def remap_tasks(self):
|
||||
|
|
|
|||
|
|
@ -31,10 +31,16 @@ def get_mcts_args(parser):
|
|||
parser.set_defaults(load_tree=False)
|
||||
parser.add_argument("--rollouts", type=int, default=5)
|
||||
parser.add_argument("--use_fixed_insights", dest="use_fixed_insights", action="store_true")
|
||||
parser.set_defaults(use_fixed_insights=False)
|
||||
parser.add_argument("--start_task_id", type=int, default=2)
|
||||
parser.add_argument(
|
||||
"--from_scratch", dest="from_scratch", action="store_true", help="Generate solutions from scratch"
|
||||
)
|
||||
parser.set_defaults(from_scratch=False)
|
||||
parser.add_argument("--no_external_eval", dest="external_eval", action="store_false")
|
||||
parser.set_defaults(external_eval=True)
|
||||
parser.add_argument("--eval_func", type=str, default="sela", choices=["sela", "mlebench"])
|
||||
parser.add_argument("--custom_dataset_dir", type=str, default=None)
|
||||
|
||||
|
||||
def get_aug_exp_args(parser):
|
||||
|
|
|
|||
|
|
@ -51,6 +51,8 @@ def get_exp_pool_path(task_name, data_config, pool_name="analysis_pool"):
|
|||
f"Dataset {task_name} not found in config file. Available datasets: {data_config['datasets'].keys()}"
|
||||
)
|
||||
exp_pool_path = os.path.join(data_path, f"{pool_name}.json")
|
||||
if not os.path.exists(exp_pool_path):
|
||||
return None
|
||||
return exp_pool_path
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue