Merge branch 'experimenter' into 'expo'

Add dataset_info to the prompt

See merge request agents/exp_optimizer!4

commit 3583e2af63
6 changed files with 47 additions and 186 deletions
expo/MCTS.py (13 changes)
@@ -3,6 +3,7 @@ import os
import pickle
import random

import numpy as np
import pandas as pd

from expo.dataset import generate_task_requirement, get_split_dataset_path
@@ -209,16 +210,20 @@ class Node:
await role.run(with_message="continue")
else:
await role.run(with_message=self.state["requirement"])
score_dict = await role.get_score()
score_dict = self.evaluate_simulation(score_dict)
self.raw_reward = score_dict
run_finished = True
except Exception as e:
mcts_logger.log("MCTS", f"Error in running the role: {e}")
num_runs += 1
if not run_finished:
mcts_logger.log("MCTS", f"Role {role.node_id} failed to run")
return {"test_score": 0, "dev_score": 0, "score": 0}
score_dict = await role.get_score()
score_dict = self.evaluate_simulation(score_dict)
self.raw_reward = score_dict
if self.state["low_is_better"]:
score_dict = {"test_score": np.inf, "dev_score": np.inf, "score": np.inf}
else:
score_dict = {"test_score": 0, "dev_score": 0, "score": 0}
self.raw_reward = score_dict
if self.state["low_is_better"]:
# normalize the score to be between 0 and 1, and higher is better
def normalize_score(score):
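The hunk is cut off by the diff context before the body of `normalize_score`. Purely as a rough sketch (not the actual implementation in `expo/MCTS.py`), a normalization that maps a lower-is-better metric onto a 0-1, higher-is-better scale might look like:

```python
# Hypothetical sketch only; the real normalize_score body is not shown in this hunk.
def normalize_score(score):
    if score == -1:  # -1 is used elsewhere in this commit as a failed-run sentinel
        return 0
    return 1 / (1 + score)  # assumes a non-negative, lower-is-better metric
```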
@@ -33,15 +33,17 @@ ### Budget
Number of experiment rounds k = 10, 20

### Prompt Usage

- Obtain the prompt by running the `generate_task_requirement` function in `dataset.py`
- Each dataset contains a `dataset_info.json`; its contents must be provided to the baselines to keep the comparison fair
- Non-DI-based methods should set `is_di=False` (see the sketch below)
- Use `utils.DATA_CONFIG` as the `data_config`
- Each dataset contains a `dataset_info.json`; its contents must be provided to the baselines to keep the comparison fair (`generate_task_requirement` already includes it by default)
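A minimal sketch of how a baseline could fetch the prompt, assuming `expo` is importable; `generate_task_requirement(task_name, data_config, is_di=True)` and `utils.DATA_CONFIG` are shown in the diffs below:

```python
# Minimal sketch; "titanic" is one of the tasks used later in this README.
from expo.dataset import generate_task_requirement
from expo.utils import DATA_CONFIG

# is_di=False drops the DI-specific instructions for non-DI-based baselines.
requirement = generate_task_requirement("titanic", DATA_CONFIG, is_di=False)
print(requirement)  # the generated prompt already references the dataset's dataset_info.json
```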

## 3. Evaluation

Run each framework; after running, the framework must produce `dev_predictions.csv` and `test_predictions.csv` for Dev and Test, with the column name `target`
Run each framework; after running, the framework must produce `dev_predictions.csv` and `test_predictions.csv` for Dev and Test; each csv file only needs a single column named `target`
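For reference, a minimal sketch of writing predictions in this single-column format, with hypothetical placeholder values standing in for a framework's real output:

```python
import pandas as pd

dev_preds = [0, 1, 0]   # placeholder predictions; replace with the framework's dev output
test_preds = [1, 0, 1]  # placeholder predictions; replace with the framework's test output

pd.DataFrame({"target": dev_preds}).to_csv("dev_predictions.csv", index=False)
pd.DataFrame({"target": test_preds}).to_csv("test_predictions.csv", index=False)
```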

- Use `CustomExperimenter`
```

@@ -62,11 +64,6 @@ #### Setup
```
pip install -U pip
pip install -U setuptools wheel

# CPU version of pytorch has smaller footprint - see installation instructions in
# pytorch documentation - https://pytorch.org/get-started/locally/
pip install torch==2.3.1 torchvision==0.18.1 --index-url https://download.pytorch.org/whl/cpu

pip install autogluon
```
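A rough sketch of how the AutoGluon baseline might go from the split files to the required prediction CSVs. The file paths and the `target` label column are assumptions here; in practice the paths come from `generate_task_requirement` / `get_split_dataset_path`, and the repo wires this through `CustomExperimenter`:

```python
import pandas as pd
from autogluon.tabular import TabularPredictor

# Hypothetical paths for illustration only.
train = pd.read_csv("datasets/titanic/split_train.csv")
dev = pd.read_csv("datasets/titanic/split_dev_wo_target.csv")
test = pd.read_csv("datasets/titanic/split_test_wo_target.csv")

predictor = TabularPredictor(label="target").fit(train)
pd.DataFrame({"target": predictor.predict(dev)}).to_csv("dev_predictions.csv", index=False)
pd.DataFrame({"target": predictor.predict(test)}).to_csv("test_predictions.csv", index=False)
```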
@@ -105,11 +102,11 @@ #### Setup

#### Run

- `python run_experiment.py --exp_mode mcts --task titanic --rollout 5`
- `python run_experiment.py --exp_mode mcts --task titanic --rollout 10`

If the dataset uses a regression metric, remember to use `--low_is_better`:

- `python run_experiment.py --exp_mode mcts --task househouse_prices --rollout 5 --low_is_better`
- `python run_experiment.py --exp_mode mcts --task househouse_prices --rollout 10 --low_is_better`

@@ -16,9 +16,8 @@ Perform data analysis, data preprocessing, feature engineering, and modeling to
Report {metric} on the eval data. Do not plot or make any visualizations.
"""

TASK_PROMPT = """\
# User requirement
{user_requirement}

DI_INSTRUCTION = """\
**Attention**
1. Please do not leak the target label in any form during training.
2. Dev and Test sets do not have the target column.

@@ -39,14 +38,19 @@ Print the training set performance in the last step. Write in this format:
print("Train score:", train_score)
```

# Output dir
{output_dir}
"""

TASK_PROMPT = """\
# User requirement
{user_requirement}
{additional_instruction}
# Data dir
training (with labels): {train_path}
dev (without labels): {dev_path}
testing (without labels): {test_path}

# Output dir
{output_dir}

dataset description: {data_info_path} (You can use this file to get additional information about the dataset)
"""

@@ -132,7 +136,12 @@ def create_dataset_dict(dataset):
    return dataset_dict


def generate_task_requirement(task_name, data_config):
def generate_di_instruction(output_dir):
    additional_instruction = DI_INSTRUCTION.format(output_dir=output_dir)
    return additional_instruction


def generate_task_requirement(task_name, data_config, is_di=True):
    user_requirement = get_user_requirement(task_name, data_config)
    split_dataset_path = get_split_dataset_path(task_name, data_config)
    train_path = split_dataset_path["train"]

@@ -140,12 +149,19 @@ def generate_task_requirement(task_name, data_config):
    test_path = split_dataset_path["test_wo_target"]
    work_dir = data_config["work_dir"]
    output_dir = f"{work_dir}/{task_name}"
    datasets_dir = data_config["datasets_dir"]
    data_info_path = f"{datasets_dir}/{task_name}/dataset_info.json"
    if is_di:
        additional_instruction = generate_di_instruction(output_dir)
    else:
        additional_instruction = ""
    user_requirement = TASK_PROMPT.format(
        user_requirement=user_requirement,
        train_path=train_path,
        dev_path=dev_path,
        test_path=test_path,
        output_dir=output_dir,
        additional_instruction=additional_instruction,
        data_info_path=data_info_path,
    )
    print(user_requirement)
    return user_requirement
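For illustration, with a hypothetical `data_config` whose keys match those read above, the constructed paths resolve as follows:

```python
# Hypothetical values for illustration only; the real config comes from utils.DATA_CONFIG.
data_config = {"datasets_dir": "datasets", "work_dir": "workspace"}
task_name = "titanic"

output_dir = f"{data_config['work_dir']}/{task_name}"  # -> "workspace/titanic"
data_info_path = f"{data_config['datasets_dir']}/{task_name}/dataset_info.json"  # -> "datasets/titanic/dataset_info.json"
```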
@@ -54,11 +54,15 @@ class Experimenter:
{"idx": i, "score_dict": score_dict, "user_requirement": user_requirement, "args": vars(self.args)}
)
self.save_result(results)  # save intermediate results
dev_scores = [result["score_dict"]["dev_score"] for result in results]
dev_scores = [
    result["score_dict"]["dev_score"] for result in results if result["score_dict"]["dev_score"] != -1
]
best_dev_score = max(dev_scores) if not self.args.low_is_better else min(dev_scores)
best_score_idx = dev_scores.index(best_dev_score)

test_scores = [result["score_dict"]["test_score"] for result in results]
test_scores = [
    result["score_dict"]["test_score"] for result in results if result["score_dict"]["dev_score"] != -1
]
avg_score = sum(test_scores) / len(test_scores)
global_best_score = max(test_scores) if not self.args.low_is_better else min(test_scores)

@@ -1,103 +0,0 @@
import argparse
import asyncio
import datetime
import json
import os

import pandas as pd

from expo.dataset import generate_task_requirement
from expo.evaluation.evaluation import evaluate_score
from expo.insights.instruction_generator import InstructionGenerator
from expo.MCTS import create_initial_state
from expo.research_assistant import ResearchAssistant
from expo.utils import DATA_CONFIG, get_exp_pool_path

EXPS_PROMPT = """
When doing the tasks, you can refer to the insights below:
{experience}

"""
data_config = DATA_CONFIG


def evaluate_test(score, state):
    datetime_text = datetime.datetime.now().strftime("%Y%m%d%H%M")
    task_name = state["task"]
    prediction_fpath = os.path.join(state["work_dir"], task_name, "predictions.csv")
    predictions = pd.read_csv(prediction_fpath)["target"]
    # copy predictions.csv to the node_dir

    predictions_node_fpath = os.path.join("results", f"{task_name}-{datetime_text}-predictions.csv")
    predictions.to_csv(predictions_node_fpath, index=False)
    # load test_target.csv
    split_datasets_dir = state["datasets_dir"]
    gt = pd.read_csv(os.path.join(split_datasets_dir["test_target"]))["target"]
    metric = state["dataset_config"]["metric"]
    score["test_score"] = evaluate_score(predictions, gt, metric)
    return score


async def main(task_name, use_reflection=True, mode="single", num_experiments=2):
    """
    mode: single or set
    single: sample one instruction
    set: sample a set of instructions
    """
    low_is_better = False
    state = create_initial_state(
        task_name, start_task_id=1, data_config=data_config, low_is_better=low_is_better, name=""
    )

    user_requirement = generate_task_requirement(task_name, data_config)
    exp_pool_path = get_exp_pool_path(task_name, data_config, pool_name="ds_analysis_pool")
    exp_pool = InstructionGenerator.load_analysis_pool(exp_pool_path)
    if mode == "single":
        exps = InstructionGenerator._random_sample(exp_pool, num_experiments)
        exps = [exp["Analysis"] for exp in exps]
    elif mode == "set":
        exp_set = InstructionGenerator.sample_instruction_set(exp_pool)
        exp_set_text = "\n".join([f"{exp['task_id']}: {exp['Analysis']}" for exp in exp_set])
        exps = [exp_set_text] * num_experiments
    else:
        raise ValueError(f"Invalid mode: {mode}")

    scores = []
    for i in range(num_experiments):
        di = ResearchAssistant(node_id=str(i), use_reflection=use_reflection)
        di.role_dir = f"{di.role_dir}_{task_name}"
        requirement = user_requirement + EXPS_PROMPT.format(experience=exps[i])
        print(requirement)
        await di.run(requirement)
        score = await di.get_score(low_is_better=False)
        score = evaluate_test(score, state)

        scores.append(score)

    with open(f"results/{task_name}_scores.json", "w") as f:
        # save scores and corresponding insights
        results = {
            "avg_score": sum([score["test_score"] for score in scores if score]) / num_experiments,
            "max_score": max([score["test_score"] for score in scores]),
            "scores": scores,
            "insights": exps,
        }
        json.dump(results, f, indent=4)


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--task", type=str, default="titanic")
    parser.add_argument("--use_reflection", dest="use_reflection", action="store_true")
    parser.add_argument("--no_use_reflection", dest="use_reflection", action="store_false")
    parser.set_defaults(use_reflection=True)
    parser.add_argument("--mode", type=str, default="single")
    parser.add_argument("--num_experiments", type=int, default=2)
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    asyncio.run(
        main(args.task, use_reflection=args.use_reflection, mode=args.mode, num_experiments=args.num_experiments)
    )

@@ -1,58 +0,0 @@
import argparse
import asyncio

from expo.evaluation.visualize_mcts import get_tree_text
from expo.MCTS import MCTS
from expo.utils import load_data_config


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--task", type=str, default="titanic")
    parser.add_argument("--low_is_better", dest="low_is_better", action="store_true")
    parser.set_defaults(low_is_better=False)
    parser.add_argument("--load_tree", dest="load_tree", action="store_true")
    parser.add_argument("--no_load_tree", dest="load_tree", action="store_false")
    parser.set_defaults(load_tree=True)
    parser.add_argument("--reflection", dest="reflection", action="store_true")
    parser.add_argument("--no_reflection", dest="reflection", action="store_false")
    parser.set_defaults(reflection=True)
    parser.add_argument("--rollouts", type=int, default=3)
    parser.add_argument("--name", type=str, default="")
    return parser.parse_args()


data_config = load_data_config()

if __name__ == "__main__":
    args = get_args()
    # requirement = generate_task_requirement(args.task, data_config)
    # print(requirement)

    # role, root_node = initialize_di_root_node(requirement, data_config)
    # asyncio.run(role.run(requirement))

    # asyncio.run(root_node.run_node())
    mcts = MCTS(root_node=None, max_depth=5)
    best_nodes = asyncio.run(
        mcts.search(
            args.task,
            data_config,
            low_is_better=args.low_is_better,
            load_tree=args.load_tree,
            reflection=args.reflection,
            rollouts=args.rollouts,
            name=args.name,
        )
    )
    best_node = best_nodes["global_best"]
    dev_best_node = best_nodes["dev_best"]
    text, num_generated_codes = get_tree_text(mcts.root_node)
    print(text)
    print(f"Generated {num_generated_codes} unique codes.")

    with open(f"results/{args.task}_tree{args.name}.txt", "w") as f:
        f.write(f"Generated {num_generated_codes} unique codes.\n")
        f.write(f"Best node: {best_node}, score: {best_node.raw_reward}\n")
        f.write(f"Dev best node: {dev_best_node}, score: {dev_best_node.raw_reward}\n")
        f.write(text)