Merge branch 'insight-pool-improve' into 'expo'

add support to run mle-bench

See merge request agents/exp_optimizer!25
This commit is contained in:
林义章 2024-10-18 07:42:44 +00:00
commit 2ce186c226
17 changed files with 624 additions and 117 deletions

View file

@ -3,10 +3,12 @@ import math
import os
import pickle
import random
import shutil
import numpy as np
import pandas as pd
from expo.data.custom_task import get_mle_bench_requirements, get_mle_task_id
from expo.data.dataset import generate_task_requirement, get_split_dataset_path
from expo.evaluation.evaluation import evaluate_score
from expo.insights.instruction_generator import InstructionGenerator
@ -17,32 +19,52 @@ from metagpt.utils.common import read_json_file
def initialize_di_root_node(state, reflection: bool = True):
# state = create_initial_state(
# task, start_task_id=start_task_id, data_config=data_config, low_is_better=low_is_better, name=name
# )
role = ResearchAssistant(
node_id="0", start_task_id=state["start_task_id"], use_reflection=reflection, role_dir=state["node_dir"]
node_id="0",
start_task_id=state["start_task_id"],
use_reflection=reflection,
role_dir=state["node_dir"],
role_timeout=state["role_timeout"],
)
return role, Node(parent=None, state=state, action=None, value=0)
def create_initial_state(
task, start_task_id, data_config, low_is_better: bool, name: str, special_instruction: str, args
):
def create_initial_state(task, start_task_id, data_config, args):
external_eval = args.external_eval
if args.custom_dataset_dir:
dataset_config = None
datasets_dir = args.custom_dataset_dir
requirement = get_mle_bench_requirements(
args.custom_dataset_dir, data_config, special_instruction=args.special_instruction
)
exp_pool_path = None
# external_eval = False # make sure external eval is false if custom dataset is used
task = get_mle_task_id(args.custom_dataset_dir)
else:
dataset_config = data_config["datasets"][task]
if dataset_config["metric"] == "rmse":
args.low_is_better = True
datasets_dir = get_split_dataset_path(task, data_config)
requirement = generate_task_requirement(
task, data_config, is_di=True, special_instruction=args.special_instruction
)
exp_pool_path = get_exp_pool_path(task, data_config, pool_name="ds_analysis_pool")
initial_state = {
"task": task,
"work_dir": data_config["work_dir"],
"node_dir": os.path.join(data_config["work_dir"], data_config["role_dir"], f"{task}{name}"),
"dataset_config": data_config["datasets"][task],
"datasets_dir": get_split_dataset_path(task, data_config),
"exp_pool_path": get_exp_pool_path(task, data_config, pool_name="ds_analysis_pool"),
"requirement": generate_task_requirement(
task, data_config, is_di=True, special_instruction=special_instruction
),
"node_dir": os.path.join(data_config["work_dir"], data_config["role_dir"], f"{task}{args.name}"),
"dataset_config": dataset_config,
"datasets_dir": datasets_dir, # won't be used if external eval is used
"exp_pool_path": exp_pool_path,
"requirement": requirement,
"has_run": False,
"start_task_id": start_task_id,
"low_is_better": low_is_better,
"low_is_better": args.low_is_better,
"role_timeout": args.role_timeout,
"external_eval": external_eval,
"custom_dataset_dir": args.custom_dataset_dir,
}
os.makedirs(initial_state["node_dir"], exist_ok=True)
return initial_state
@ -108,7 +130,7 @@ class Node:
return f"{self.parent.id}-{num_sibling}"
def is_terminal(self):
return int(self.state["start_task_id"]) == self.max_depth + 1
return int(self.state["start_task_id"]) == self.max_depth + 1 # TODO: Check if this is correct or +1
def is_fully_expanded(self):
return len(self.children) > 0
@ -173,22 +195,34 @@ class Node:
node.save_new_role(new_role)
self.add_child(node)
def evaluate_prediction(self, split):
pred_path = os.path.join(self.state["work_dir"], self.state["task"], f"{split}_predictions.csv")
pred_node_path = os.path.join(self.state["node_dir"], f"Node-{self.id}-{split}_predictions.csv")
def get_predictions_path(self, split):
return os.path.join(self.state["node_dir"], f"Node-{self.id}-{split}_predictions.csv")
def get_and_move_predictions(self, split):
if not os.path.exists(self.get_predictions_path(split)):
pred_path = os.path.join(self.state["work_dir"], self.state["task"], f"{split}_predictions.csv")
shutil.copy(pred_path, self.get_predictions_path(split))
os.remove(pred_path)
return pd.read_csv(self.get_predictions_path(split))
def get_gt(self, split):
gt_path = os.path.join(self.state["datasets_dir"][f"{split}_target"])
preds = pd.read_csv(pred_path)["target"]
preds.to_csv(pred_node_path, index=False)
gt = pd.read_csv(gt_path)["target"]
return pd.read_csv(gt_path)
def evaluate_prediction(self, split):
preds = self.get_and_move_predictions(split)["target"]
gt = self.get_gt(split)["target"]
metric = self.state["dataset_config"]["metric"]
# remove original predictions.csv
os.remove(pred_path)
return evaluate_score(preds, gt, metric)
def evaluate_simulation(self, score_dict):
scores = {"dev_score": self.evaluate_prediction("dev"), "test_score": self.evaluate_prediction("test")}
scores["score"] = scores["dev_score"]
score_dict.update(scores)
if self.state["external_eval"]: # use external evaluation
scores = {"dev_score": self.evaluate_prediction("dev"), "test_score": self.evaluate_prediction("test")}
scores["score"] = scores["dev_score"]
score_dict.update(scores)
else:
self.get_and_move_predictions("dev")
self.get_and_move_predictions("test")
return score_dict
async def run_node(self, role=None):
@ -215,7 +249,6 @@ class Node:
mcts_logger.log("MCTS", f"Role-level timeout: {e}")
break
except Exception as e:
print(f"Error: {e}")
mcts_logger.log("MCTS", f"Error in running the role: {e}")
num_runs += 1
@ -235,14 +268,15 @@ class Node:
score_dict = {k: normalize_score(v) for k, v in score_dict.items()}
self.normalized_reward = score_dict
return score_dict
result_dict = role.get_solution()
return score_dict, result_dict
class MCTS:
# data_path
root_node: Node = None
children: dict = {}
max_depth: int = 5
max_depth: int = None
c_explore: float = 1.4
c_unvisited: float = 0.8
node_order: list = []
@ -281,9 +315,9 @@ class MCTS:
mcts_logger.log("MCTS", f"Start simulating node {node.id}:")
while node.children:
node = random.choice(node.children)
reward = await node.run_node(role)
reward, result_dict = await node.run_node(role)
mcts_logger.log("MCTS", f"Simulated node's reward: {reward}")
# TODO: add new insights
return reward
def backpropagate(self, node: Node, reward):
@ -341,12 +375,17 @@ class MCTS:
scores["test_raw"].append(node.raw_reward["test_score"])
return scores
async def search(self, state, rollouts, load_tree=False, reflection=False):
async def search(self, state, args):
reflection = args.reflection
load_tree = args.load_tree
rollouts = args.rollouts
from_scratch = args.from_scratch
role, root = initialize_di_root_node(state, reflection=reflection)
self.root_node = root
self.instruction_generator = InstructionGenerator(
file_path=state["exp_pool_path"], use_fixed_insights=self.use_fixed_insights
state=state, use_fixed_insights=self.use_fixed_insights, from_scratch=from_scratch
)
await self.instruction_generator.initialize()
tree_loaded = False
if load_tree:

View file

@ -6,7 +6,12 @@ # SELA: Tree-Search Enhanced LLM Agents for Automated Machine Learning
## 1. Data Preparation
- Download Datasetshttps://deepwisdom.feishu.cn/drive/folder/RVyofv9cvlvtxKdddt2cyn3BnTc?from=from_copylink
- Download and prepare datasets from scratch:
```
cd expo/data
python dataset.py --save_analysis_pool
python hf_data.py --save_analysis_pool
```
## 2. Configs
@ -60,7 +65,7 @@ #### Run
If the dataset has reg metric, remember to use `--low_is_better`:
- `python run_experiment.py --exp_mode mcts --task house_prices --rollouts 10 --low_is_better`
- `python run_experiment.py --exp_mode mcts --task house-prices --rollouts 10 --low_is_better`
In addition to the generated insights, include the fixed insights saved in `expo/insights/fixed_insights.json`
@ -85,6 +90,23 @@ ## 4. Evaluation
- Use the function `evaluate_score` to evaluate.
#### MLE-Bench
**Note: mle-bench requires python 3.11 or higher**
```
git clone https://github.com/openai/mle-bench.git
cd mle-bench
pip install -e .
```
```
mlebench prepare -c <competition-id> --data-dir <dataset-dir-save-path>
```
Enter the following command to run the experiment:
```
python run_experiment.py --exp_mode mcts --custom_dataset_dir <dataset-dir-save-path/prepared/public> --rollouts 10 --from_scratch --role_timeout 3600
```
## 5. Baselines
### DS Agent
@ -92,7 +114,7 @@ ### DS Agent
git clone https://github.com/guosyjlu/DS-Agent.git
```
将其deployment/generate.py line46-48行部分修改如下目的是用deepseek而非GPT的API
Modify the following lines in deployment/generate.py (lines 46-48) as shown below (the purpose is to use deepseek instead of OpenAI's API):
```python
messages = [{"role": "user", "content": prompt}]
@ -120,7 +142,7 @@ ### DS Agent
completion = raw_completion.split("```python")[1].split("```")[0]
```
修改完后在新建一个`deployment/test.sh` 分别运行下列两行,`$TASK` 是你要测试的task name
After making the changes, create a new `deployment/test.sh` and run the following two lines separately, where `$TASK` is the name of the task you want to test
```
python -u generate.py --llm deepseek-coder --task $TASK --shot 1 --retrieval > "$TASK".txt 2>&1
@ -135,7 +157,7 @@ #### Setup
git clone https://github.com/WecoAI/aideml.git
```
修改 `aideml/aide/utils/config.yaml` 其中的 `step` `k_fold_validation` `code model` `feedback model` 参数如下
Modify `aideml/aide/utils/config.yaml` - change `k_fold_validation`, `code model`, and `feedback model` as follows:
```yaml
# agent hyperparams
@ -153,16 +175,22 @@ # LLM settings for evaluating program output / tracebacks
feedback:
model: deepseek-coder
temp: 0.5
# hyperparameters for the tree search
search:
max_debug_depth: 3
debug_prob: 0.5
num_drafts: 5
```
由于 deepseek 完全兼容 OpenAI 的 API修改`base_url``自己的url``api_key``自己的key`即可
Since Deepseek is compatible to OpenAI's API, change `base_url` into `your own url``api_key` into `your api key`
```
export OPENAI_API_KEY="自己的key"
export OPENAI_BASE_URL="自己的url"
export OPENAI_API_KEY="your api key"
export OPENAI_BASE_URL="your own url"
```
修改`aideml/aide/backend/__init__.py` 30 行内容如下:
Modify `aideml/aide/backend/__init__.py`'s line 30 and below:
```python
model_kwargs = model_kwargs | {
@ -176,7 +204,7 @@ # LLM settings for evaluating program output / tracebacks
query_func = backend_openai.query
```
由于 deepseekV2.5 不再支持 system message 使用 function call修改 `aideml/aide/agent.py` 312 行内容如下:
Since deepseekV2.5 no longer supports system message using function call, modify `aideml/aide/agent.py`'s line 312:
```python
response = cast(
@ -191,7 +219,7 @@ # LLM settings for evaluating program output / tracebacks
)
```
修改完后
Modify and install:
```
cd aideml
@ -200,8 +228,8 @@ # LLM settings for evaluating program output / tracebacks
#### Run
运行下面脚本获取运行结果,在当前目录下将生成一个 log 文件夹以及 workspace 文件夹
log 文件夹中将包含实验使用配置以及生成方案记录workspace 文件夹下将保存 aide 最后生成的结果文件
Run the following script to get the running results, a `log` folder and a `workspace` folder will be generated in the current directory
The `log` folder will contain the experimental configuration and the generated scheme, and the `workspace` folder will save the final results generated by aide
```
python experimenter/aide.py
@ -227,7 +255,6 @@ #### Setup
Replace {task_name} with the specific task you want to run.
提供github链接并说明使用的命令以及参数设置
### AutoSklearn
#### System requirements
auto-sklearn has the following system requirements:
@ -258,15 +285,4 @@ ### Base DI
For setup, check 4.
- `python run_experiment.py --exp_mode base --task titanic --num_experiments 10`
- Specifically instruct DI to use AutoGluon: `--special_instruction ag`
- Specifically instruct DI to use the stacking ensemble method: `--special_instruction stacking`
- Specifically instruct DI to use the stacking ensemble method: `--special_instruction stacking`

74
expo/data/custom_task.py Normal file
View file

@ -0,0 +1,74 @@
import os
from expo.data.dataset import SPECIAL_INSTRUCTIONS
from expo.experimenter.mle_bench.instructions import (
ADDITIONAL_NOTES,
INSTRUCTIONS,
INSTRUCTIONS_OBFUSCATED,
)
MLE_BENCH_FILES = ["description.md", "description_obfuscated.md"]
MLE_REQUIREMENTS = """
{instructions}
{additonal_notes}
COMPETITION INSTRUCTIONS
------
{task_description}
## More Instructions
- output_dir: {output_dir}
- Besides `submission.csv`, you should also save your output in the output directory.
- You should split the training data into train and dev set.
- You should use the dev set to improve your model. Print the final dev set score after training.
- Save the prediction results of BOTH the dev set and test set in `dev_predictions.csv` and `test_predictions.csv` respectively in the output directory. They should be in the same format as the `submission.csv`.
- Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. {special_instruction}
**Do not make any plots or visualizations.**
"""
def get_mle_task_id(dataset_dir):
return dataset_dir.split("/")[-3]
def get_mle_is_lower_better(task):
from mlebench.data import get_leaderboard
from mlebench.registry import registry
competition = registry.get_competition(task)
competition_leaderboard = get_leaderboard(competition)
return competition.grader.is_lower_better(competition_leaderboard)
def get_mle_bench_requirements(dataset_dir, data_config, special_instruction, obfuscated=False):
work_dir = data_config["work_dir"]
task = get_mle_task_id(dataset_dir)
output_dir = f"{work_dir}/{task}"
final_output_dir = f"{work_dir}/submission"
os.makedirs(output_dir, exist_ok=True)
if special_instruction:
special_instruction = SPECIAL_INSTRUCTIONS[special_instruction]
else:
special_instruction = ""
if obfuscated:
instructions = INSTRUCTIONS_OBFUSCATED.format(dataset_dir=dataset_dir, output_dir=final_output_dir)
task_file = "description_obfuscated.md"
else:
instructions = INSTRUCTIONS.format(dataset_dir=dataset_dir, output_dir=output_dir)
task_file = "description.md"
with open(os.path.join(dataset_dir, task_file), encoding="utf-8") as f:
task_description = f.read()
mle_requirement = MLE_REQUIREMENTS.format(
instructions=instructions,
additonal_notes=ADDITIONAL_NOTES,
task_description=task_description,
output_dir=output_dir,
special_instruction=special_instruction,
)
print(mle_requirement)
return mle_requirement

View file

@ -1,3 +1,4 @@
import argparse
import asyncio
import json
import os
@ -18,22 +19,22 @@ Report {metric} on the eval data. Do not plot or make any visualizations.
"""
USE_AG = """
7. Please use autogluon for model training with presets='medium_quality', time_limit=None, give dev dataset to tuning_data, and use right eval_metric.
- Please use autogluon for model training with presets='medium_quality', time_limit=None, give dev dataset to tuning_data, and use right eval_metric.
"""
TEXT_MODALITY = """
7. You could use models from transformers library for this text dataset.
8. Use gpu if available for faster training.
- You could use models from transformers library for this text dataset.
- Use gpu if available for faster training.
"""
IMAGE_MODALITY = """
7. You could use models from transformers/torchvision library for this image dataset.
8. Use gpu if available for faster training.
- You could use models from transformers/torchvision library for this image dataset.
- Use gpu if available for faster training.
"""
STACKING = """
7. To avoid overfitting, train a weighted ensemble model such as StackingClassifier or StackingRegressor.
8. You could do some quick model prototyping to see which models work best and then use them in the ensemble.
- To avoid overfitting, train a weighted ensemble model such as StackingClassifier or StackingRegressor.
- You could do some quick model prototyping to see which models work best and then use them in the ensemble.
"""
@ -268,7 +269,7 @@ class ExpDataset:
dataset_info = self.get_dataset_info()
num_classes = dataset_info["metadata"]["NumberOfClasses"]
if num_classes == 2:
metric = "f1"
metric = "f1 binary"
elif 2 < num_classes <= 200:
metric = "f1 weighted"
elif num_classes > 200 or num_classes == 0:
@ -361,10 +362,22 @@ async def process_dataset(dataset, solution_designer: SolutionDesigner, save_ana
datasets_dict["datasets"][dataset.name] = dataset_dict
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--force_update", action="store_true", help="Force update datasets")
parser.add_argument("--save_analysis_pool", action="store_true", help="Save analysis pool")
parser.add_argument(
"--no_save_analysis_pool", dest="save_analysis_pool", action="store_false", help="Do not save analysis pool"
)
parser.set_defaults(save_analysis_pool=True)
return parser.parse_args()
if __name__ == "__main__":
datasets_dir = DATA_CONFIG["datasets_dir"]
force_update = False
save_analysis_pool = True
args = parse_args()
force_update = args.force_update
save_analysis_pool = args.save_analysis_pool
datasets_dict = {"datasets": {}}
solution_designer = SolutionDesigner()
for dataset_id in OPENML_DATASET_IDS:

View file

@ -7,7 +7,12 @@ import pandas as pd
from datasets import load_dataset
from PIL import Image
from expo.data.dataset import ExpDataset, process_dataset, save_datasets_dict_to_yaml
from expo.data.dataset import (
ExpDataset,
parse_args,
process_dataset,
save_datasets_dict_to_yaml,
)
from expo.insights.solution_designer import SolutionDesigner
from expo.utils import DATA_CONFIG
@ -116,8 +121,9 @@ class HFExpDataset(ExpDataset):
if __name__ == "__main__":
dataset_dir = DATA_CONFIG["datasets_dir"]
save_analysis_pool = True
force_update = False
args = parse_args()
force_update = args.force_update
save_analysis_pool = args.save_analysis_pool
datasets_dict = {"datasets": {}}
solution_designer = SolutionDesigner()
for dataset_meta in HFDATSETS:

View file

@ -1,3 +1,5 @@
from pathlib import Path
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, roc_auc_score
@ -22,3 +24,26 @@ def evaluate_score(pred, gt, metric):
return mean_squared_error(np.log1p(gt), np.log1p(pred), squared=False)
else:
raise ValueError(f"Metric {metric} not supported")
def node_evaluate_score_sela(node):
preds = node.get_and_move_predictions("test")["target"]
gt = node.get_gt("test")["target"]
metric = node.state["dataset_config"]["metric"]
return evaluate_score(preds, gt, metric)
def node_evaluate_score_mlebench(node):
# TODO
from mlebench.grade import grade_csv
from mlebench.registry import registry
competition_id = node.state["task"]
data_dir = Path(node.state["custom_dataset_dir"]).parent.parent.parent # prepared/public/../../../
pred_path = node.get_predictions_path("test")
new_registry = registry.set_data_dir(data_dir)
competition = new_registry.get_competition(competition_id)
submission = Path(pred_path)
report = grade_csv(submission, competition).to_dict()
report["submission_path"] = str(submission)
return report

View file

@ -1,5 +1,8 @@
import textwrap
import matplotlib.pyplot as plt
import networkx as nx
from expo.MCTS import Node
NODE_TEMPLATE = """\
@ -11,6 +14,9 @@ Score: {score}, Visits: {num_visits}
"""
NODE_SIZE = 12000
NODE_FONT_SIZE = 18
def get_role_plans(role):
plans = role.planner.plan.tasks
@ -42,7 +48,7 @@ def get_tree_text(node: Node):
id=node_id, plans=instruct_plans_text, simulated=simulated, score=score, num_visits=num_visits
)
def visualize_tree(node, depth=0, previous_plans=None):
def visualize_tree_text(node, depth=0, previous_plans=None):
text = ""
if node is not None:
text += visualize_node(node, previous_plans)
@ -50,10 +56,107 @@ def get_tree_text(node: Node):
code_set.update({task.instruction for task in role.planner.plan.tasks})
previous_plans = get_role_plans(role)
for child in node.children:
text += textwrap.indent(visualize_tree(child, depth + 1, previous_plans), "\t")
text += textwrap.indent(visualize_tree_text(child, depth + 1, previous_plans), "\t")
return text
num_simulations = node.visited
text = f"Number of simulations: {num_simulations}\n"
text += visualize_tree(node)
text += visualize_tree_text(node)
return text, len(code_set)
def get_node_color(node):
if node["visits"] == 0:
return "#D3D3D3"
else:
# The higher the avg_value, the more intense the color
# avg_value is between 0 and 1
avg_value = node["avg_value"]
# Convert avg_value to a color ranging from red (low) to green (high)
red = int(255 * (1 - avg_value))
green = int(255 * avg_value)
return f"#{red:02X}{green:02X}00"
def visualize_tree(graph, show_instructions=False, save_path=""):
# Use a hierarchical layout for tree-like visualization
pos = nx.spring_layout(graph, k=0.9, iterations=50)
plt.figure(figsize=(30, 20)) # Further increase figure size for better visibility
# Calculate node levels
root = "0"
levels = nx.single_source_shortest_path_length(graph, root)
max_level = max(levels.values())
# Adjust y-coordinates based on levels and x-coordinates to prevent overlap
nodes_by_level = {}
for node, level in levels.items():
if level not in nodes_by_level:
nodes_by_level[level] = []
nodes_by_level[level].append(node)
for level, nodes in nodes_by_level.items():
y = 1 - level / max_level
x_step = 1.0 / (len(nodes) + 1)
for i, node in enumerate(sorted(nodes)):
pos[node] = ((i + 1) * x_step, y)
# Draw edges
nx.draw_networkx_edges(graph, pos, edge_color="gray", arrows=True, arrowsize=40, width=3)
# Draw nodes
node_colors = [get_node_color(graph.nodes[node]) for node in graph.nodes]
nx.draw_networkx_nodes(graph, pos, node_size=NODE_SIZE, node_color=node_colors)
# Add labels to nodes
labels = nx.get_node_attributes(graph, "label")
nx.draw_networkx_labels(graph, pos, labels, font_size=NODE_FONT_SIZE)
if show_instructions:
# Add instructions to the right side of nodes
instructions = nx.get_node_attributes(graph, "instruction")
for node, (x, y) in pos.items():
wrapped_text = textwrap.fill(instructions[node], width=30) # Adjust width as needed
plt.text(x + 0.05, y, wrapped_text, fontsize=15, ha="left", va="center")
plt.title("MCTS Tree Visualization", fontsize=40)
plt.axis("off") # Turn off axis
plt.tight_layout()
if save_path:
plt.savefig(save_path)
plt.show()
def build_tree_recursive(graph, parent_id, node, start_task_id=2):
"""
Recursively builds the entire tree starting from the root node.
Adds nodes and edges to the NetworkX graph.
"""
role = node.load_role()
depth = node.get_depth()
if depth == 0:
instruction = "\n\n".join([role.planner.plan.tasks[i].instruction for i in range(start_task_id)])
else:
instruction = role.planner.plan.tasks[depth + start_task_id - 1].instruction
print(instruction)
# Add the current node with attributes to the graph
dev_score = node.raw_reward.get("dev_score", 0) * 100
avg_score = node.avg_value() * 100
graph.add_node(
parent_id,
label=f"{node.id}\nAvg: {avg_score:.1f}\nScore: {dev_score:.1f}\nVisits: {node.visited}",
avg_value=node.avg_value(),
dev_score=dev_score,
visits=node.visited,
instruction=instruction,
)
# Stopping condition: if the node has no children, return
if not node.children:
return
# Recursively create all child nodes
for i, child in enumerate(node.children):
child_id = f"{parent_id}-{i}"
graph.add_edge(parent_id, child_id)
build_tree_recursive(graph, child_id, child)

View file

@ -21,9 +21,7 @@ class CustomExperimenter(Experimenter):
self.task,
start_task_id=1,
data_config=self.data_config,
low_is_better=self.low_is_better,
name=self.name,
special_instruction=self.args.special_instruction,
args=self.args,
)
def run_experiment(self):

View file

@ -24,9 +24,6 @@ class Experimenter:
self.args.task,
start_task_id=self.start_task_id,
data_config=self.data_config,
low_is_better=self.args.low_is_better,
name=self.args.name,
special_instruction=self.args.special_instruction,
args=self.args,
)
@ -43,7 +40,10 @@ class Experimenter:
except Exception as e:
print(f"Error: {e}")
num_runs += 1
save_notebook(role=di, save_dir=self.result_path, name=f"{self.args.task}_{self.start_time}_{run_idx}")
# save_notebook(role=di, save_dir=self.result_path, name=f"{self.args.task}_{self.start_time}_{run_idx}")
save_name = self.get_save_name()
save_notebook(role=di, save_dir=self.result_path, name=f"{save_name}_{run_idx}")
if not run_finished:
score_dict = {"train_score": -1, "dev_score": -1, "test_score": -1, "score": -1}
return score_dict

View file

@ -1,5 +1,9 @@
import shutil
from expo.evaluation.evaluation import (
node_evaluate_score_mlebench,
node_evaluate_score_sela,
)
from expo.evaluation.visualize_mcts import get_tree_text
from expo.experimenter.experimenter import Experimenter
from expo.Greedy import Greedy, Random
@ -14,30 +18,35 @@ class MCTSExperimenter(Experimenter):
self.start_task_id = 1 # start from datapreprocessing if it is image task
else:
self.start_task_id = args.start_task_id
if args.eval_func == "sela":
self.eval_func = node_evaluate_score_sela
elif args.eval_func == "mlebench":
self.eval_func = node_evaluate_score_mlebench
super().__init__(args, **kwargs)
self.tree_mode = tree_mode
async def run_experiment(self):
use_fixed_insights = self.args.use_fixed_insights
depth = self.args.max_depth
if self.tree_mode == "greedy":
mcts = Greedy(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights)
mcts = Greedy(root_node=None, max_depth=depth, use_fixed_insights=use_fixed_insights)
elif self.tree_mode == "random":
mcts = Random(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights)
mcts = Random(root_node=None, max_depth=depth, use_fixed_insights=use_fixed_insights)
else:
mcts = MCTS(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights)
best_nodes = await mcts.search(
state=self.state,
reflection=self.args.reflection,
rollouts=self.args.rollouts,
load_tree=self.args.load_tree,
)
mcts = MCTS(root_node=None, max_depth=depth, use_fixed_insights=use_fixed_insights)
best_nodes = await mcts.search(state=self.state, args=self.args)
best_node = best_nodes["global_best"]
dev_best_node = best_nodes["dev_best"]
score_dict = best_nodes["scores"]
additional_scores = {"grader": self.eval_func(dev_best_node)}
text, num_generated_codes = get_tree_text(mcts.root_node)
text += f"Generated {num_generated_codes} unique codes.\n"
text += f"Best node: {best_node.id}, score: {best_node.raw_reward}\n"
text += f"Dev best node: {dev_best_node.id}, score: {dev_best_node.raw_reward}\n"
text += f"Grader score: {additional_scores['grader']}\n"
print(text)
results = [
{
@ -50,6 +59,7 @@ class MCTSExperimenter(Experimenter):
"tree_text": text,
"args": vars(self.args),
"scores": score_dict,
"additional_scores": additional_scores,
}
]
self.save_result(results)

View file

@ -0,0 +1,48 @@
SUBMISSION_INSTURCTION = """
- **Submission checker**: We provide a tool for checking the validity of your submission file, in the form of a server endpoint at `http://localhost:5000/validate`. If you post a submission file to the server (e.g. `curl -X POST -F "file=@${SUBMISSION_FILE}" ${SERVER_URL}`), you will receive a response message telling you if your submission is valid or not (note that this does NOT give you a score). For your convenience, we provide a script `/home/validate_submission.sh` which can help you make this submission to the validation server.
"""
INSTRUCTIONS = """
You are participating in *MLE-bench*, an offline version of Kaggle competitions adapted to test the coding ability of agents like yourself.
You will be given a machine learning task. You must solve the task by training a model and running the model on the test set to produce a submission file. You may not hand-label the data to produce a submission file.
BENCHMARK INSTRUCTIONS
------
First, please read the instructions below which are specific to our benchmark setup. Note, these instructions can be found again in `expo.experimenter.mle_bench.instructions.py`.
- **Competition-specific instructions**: The instructions for the competition are available in `{dataset_dir}/description.md`. Read this file to understand the problem setup for the competition you are currently solving.
- **Dataset**: The dataset files for the competition are available in `{dataset_dir}`.
- **Submission**: We have simplified all competition types to use a simple CSV submission format. Regardless of what the competition instructions say, you MUST produce a submission file at `{output_dir}/submission.csv`. We will only evaluate submissions found at this path; failure to submit a `submission.csv` at this path will result in a failed attempt.
- **Test set**: In general, since we do not have access to the original test labels, we have constructed our own train/test split for each competition. As such, the dataset you see in `{dataset_dir}` may differ slightly from what is described in the competition instructions. Nevertheless the files available to you should otherwise be similar to the original and you should proceed with the competition task on the train/test splits available to you.
- **Cheating**: You are not allowed to view others' solutions to this competition. If we detect that your submission is plagiarized from others, the submission will be disqualified.
Your submission will be graded based on the metric in the competition-specific instructions. Your goal is to achieve the best score possible on the competition task.
Wherever there is a conflict between the points above and the competition-specific instructions in `{dataset_dir}/description.md`, the above points should always take priority since these are always true for our offline benchmark setup.
"""
INSTRUCTIONS_OBFUSCATED = """
You are participating in *MLE-bench*, a benchmark designed to test the coding ability of agents like yourself.
You will be given a machine learning task. You must solve the task by training a model and running the model on the test set to produce a submission file. You may not hand-label the data to produce a submission file.
BENCHMARK INSTRUCTIONS
------
First, please read the instructions below which are specific to our benchmark setup. Note, these instructions can be found again in `expo.experimenter.mle_bench.instructions.py`.
- **Task-specific instructions**: The instructions for the task are available in `{dataset_dir}/description.md`. Read this file to understand the problem setup for the task you are currently solving.
- **Dataset**: The dataset files for the task are available in `{dataset_dir}/`.
- **Submission**: Regardless of what the task instructions say, you MUST produce a submission file at `{output_dir}/submission.csv`. We will only evaluate submissions found at this path; failure to submit a `submission.csv` at this path will result in a failed attempt.
- **Test set**: In general, since we do not have access to the original test labels, we have constructed our own train/test split for each task. As such, the dataset you see in `{dataset_dir}` may differ slightly from what is described in the task instructions. Nevertheless the files available to you should otherwise be similar to the original and you should proceed with the task on the train/test splits available to you.
- **Cheating**: You are not allowed to view others' solutions to this task. If we detect that your submission is plagiarized from others, the submission will be disqualified.
Your submission will be graded based on the metric in the task-specific instructions. Your goal is to achieve the best score possible on the task.
Wherever there is a conflict between the points above and the task-specific instructions in `{dataset_dir}/description.md`, the above points should always take priority since these are always true for our offline benchmark setup.
"""
ADDITIONAL_NOTES = """
ADDITIONAL NOTES
------
- **Compute**: You have access to gpu with the appropriate drivers installed.
"""

View file

@ -1,6 +1,7 @@
import json
import os
import random
from difflib import SequenceMatcher
from expo.insights.solution_designer import SolutionDesigner
from expo.utils import clean_json_from_rsp, load_data_config, mcts_logger
@ -33,11 +34,30 @@ DATA_CONFIG = load_data_config()
class InstructionGenerator:
data_config = DATA_CONFIG
def __init__(self, file_path, use_fixed_insights=False):
self.file_path = file_path
def __init__(self, state, use_fixed_insights, from_scratch):
self.state = state
self.file_path = state["exp_pool_path"]
if state["custom_dataset_dir"]:
with open(f"{state['custom_dataset_dir']}/description.md", "r", encoding="utf-8") as file:
self.dataset_info = file.read()
else:
dataset_info_path = (
f"{self.data_config['datasets_dir']}/{state['dataset_config']['dataset']}/dataset_info.json"
)
with open(dataset_info_path, "r") as file:
self.dataset_info = json.load(file)
self.use_fixed_insights = use_fixed_insights
self.analysis_pool = self.load_insight_pool(file_path, use_fixed_insights)
self.proposer = SolutionDesigner()
if self.file_path is None:
self.from_scratch = True
else:
self.from_scratch = from_scratch
async def initialize(self):
if self.from_scratch:
self.insight_pool = await self.generate_solutions_from_scratch(self.dataset_info, self.state["task"])
else:
self.insight_pool = self.load_insight_pool(self.file_path, self.use_fixed_insights)
@staticmethod
def load_json_data(json_dir):
@ -84,14 +104,14 @@ class InstructionGenerator:
data.extend(fixed_insights)
for item in data:
if "task_id" not in item:
raise ValueError("task_id is not found in the analysis pool")
raise ValueError("task_id is not found in the insight_pool")
if task_id:
data = [item for item in data if int(item["task_id"]) == int(task_id)]
return data
async def generate_new_instructions(self, task_id, original_instruction, max_num, ext_info=None):
data = self.analysis_pool
data = self.insight_pool
new_instructions = []
if len(data) == 0:
mcts_logger.log("MCTS", f"No insights available for task {task_id}")
@ -108,6 +128,34 @@ class InstructionGenerator:
new_instructions.append(new_instruction)
return new_instructions
async def propose_new_insights(self, solution, score):
new_insights = await self.proposer.propose_insights(solution, score)
added_insights = self.add_insight(new_insights)
return added_insights
async def generate_solutions_from_scratch(self, dataset_info, dataset_name):
insight_pool = await self.proposer.generate_solutions(dataset_info, dataset_name, save_analysis_pool=False)
return insight_pool
def add_insight(self, new_insights):
added_insights = []
for new_insight in new_insights:
if not self.is_similar_to_existing(new_insight):
added_insights.append(new_insight)
self.insight_pool.append(new_insight)
return added_insights
def is_similar_to_existing(self, new_insight, similarity_threshold=0.8):
for existing_insight in self.insight_pool:
similarity = self.calculate_similarity(new_insight["Analysis"], existing_insight["Analysis"])
if similarity > similarity_threshold:
return True
return False
@staticmethod
def calculate_similarity(text1, text2):
return SequenceMatcher(None, text1, text2).ratio()
@staticmethod
async def generate_new_instruction(original_instruction, insights, ext_info):
prompt = CHANGE_INSTRUCTION.format(instruction=original_instruction, insights=insights)

View file

@ -5,7 +5,8 @@ from metagpt.llm import LLM
DATA_CONFIG = load_data_config()
DATASET_INSIGHT_PROMPT = """
DATASET_DESCRIPTION_SELA_PROMPT = """
# Dataset Description
{dataset}
@ -14,6 +15,15 @@ DATASET_INSIGHT_PROMPT = """
# Dataset Head
{head}
"""
DATASET_DESCRIPTION_CUSTOM_PROMPT = """
# Dataset Description
{dataset_description}
"""
DATASET_INSIGHT_PROMPT = """
{description}
# Instruction
Propose insights to help improve the performance of the model on this dataset.
@ -70,6 +80,45 @@ Your model choices should be advanced enough to be helpful.
```
"""
INSIGHT_PROPOSAL_PROMPT = """
You are an AI assistant tasked with analyzing a machine learning solution and proposing new insights to improve its performance. Given the current solution code and development score, suggest innovative approaches to enhance the model.
Current Solution Code:
{solution_code}
Development Score: {dev_score}
Based on this information, propose 3-5 new insights across different aspects of the machine learning pipeline (Data Preprocessing, Feature Engineering, and Model Training). Your insights should be specific, actionable, and have the potential to improve the model's performance.
Please format your response as a JSON array with the following structure:
[
{{
"task_type": "Data Preprocessing",
"insights": [
"insight1",
"insight2"
]
}},
{{
"task_type": "Feature Engineering",
"insights": [
"insight1",
"insight2"
]
}},
{{
"task_type": "Model Training",
"insights": [
"insight1",
"insight2"
]
}}
]
"""
KEY_DATASET_FEATURES = [
"NumberOfClasses",
"NumberOfFeatures",
@ -86,18 +135,32 @@ TASK_TO_ID = {"EDA": 1, "Data Preprocessing": 2, "Feature Engineering": 3, "Mode
class SolutionDesigner:
data_dir: str = DATA_CONFIG["datasets_dir"]
async def generate_solutions(self, dataset_info, dataset_name):
async def generate_solutions(self, dataset_info, dataset_name, save_analysis_pool=True):
llm = LLM()
context = DATASET_INSIGHT_PROMPT.format(
dataset=dataset_info["description"],
metadata=self.metadata_builder(dataset_info["metadata"]),
head=dataset_info["df_head"],
)
if type(dataset_info) == dict:
description_prompt = DATASET_DESCRIPTION_SELA_PROMPT.format(
dataset=dataset_info["description"],
metadata=self.metadata_builder(dataset_info["metadata"]),
head=dataset_info["df_head"],
)
else:
description_prompt = DATASET_DESCRIPTION_CUSTOM_PROMPT.format(dataset_description=dataset_info)
context = DATASET_INSIGHT_PROMPT.format(description=description_prompt)
rsp = await llm.aask(context)
rsp = clean_json_from_rsp(rsp)
analysis_pool = self.process_analysis_pool(json.loads(rsp))
dataset_path = f"{self.data_dir}/{dataset_name}"
self.save_analysis_pool(dataset_path, analysis_pool)
if save_analysis_pool:
dataset_path = f"{self.data_dir}/{dataset_name}"
self.save_analysis_pool(dataset_path, analysis_pool)
return analysis_pool
async def propose_new_insights(self, solution, score):
llm = LLM()
context = INSIGHT_PROPOSAL_PROMPT.format(solution_code=solution, dev_score=score)
rsp = await llm.aask(context)
rsp = clean_json_from_rsp(rsp)
new_insights = self.process_analysis_pool(json.loads(rsp))
return new_insights
def process_analysis_pool(self, insights_rsp):
analysis_pool = []

View file

@ -13,15 +13,19 @@ from metagpt.roles.di.data_interpreter import DataInterpreter
from metagpt.schema import Message, Task, TaskResult
from metagpt.utils.common import CodeParser, write_json_file
EXTRACT_SCORE_PROMPT = """
# Code:
CODE_BLOCK_RESULT = """
## Code:
{code}
# Execution Result:
## Execution Result:
{result}
"""
EXTRACT_SCORE_PROMPT = """
# Code Blocks
{code_block}
# Instruction:
Based on the code and execution result, please extract the scores and return it as a dictionary.
Based on the code and execution result, please extract the **final scores** and return it as a dictionary.
If you cannot find the scores, please still return a dictionary with the keys 'train_score', 'dev_score', and 'test_score', and set the values to -1.
# Format:
@ -109,9 +113,17 @@ class ResearchAssistant(DataInterpreter):
return score_dict
async def llm_extract_score(self):
result_text = self.planner.plan.task_map[str(len(self.planner.plan.task_map))].result
code_text = self.planner.plan.task_map[str(len(self.planner.plan.task_map))].code
rsp = await self.llm.aask(EXTRACT_SCORE_PROMPT.format(code=code_text, result=result_text, role="user"))
# result_text = self.planner.plan.task_map[str(len(self.planner.plan.task_map))].result
# code_text = self.planner.plan.task_map[str(len(self.planner.plan.task_map))].code
num_tasks = len(self.planner.plan.task_map)
task_map = self.planner.plan.task_map
code_block = "\n".join(
[
CODE_BLOCK_RESULT.format(code=task_map[str(i + 1)].code, result=task_map[str(i + 1)].result)
for i in range(num_tasks)
]
)
rsp = await self.llm.aask(EXTRACT_SCORE_PROMPT.format(code_block=code_block, role="user"))
json_block = CodeParser.parse_code(block=None, text=rsp)
score_dict = json.loads(json_block)
return score_dict
@ -139,6 +151,11 @@ class ResearchAssistant(DataInterpreter):
save_notebook(role=self, save_dir=self.role_dir, name=self.get_node_name())
return task_result
def get_solution(self):
codes = [task.code for task in self.planner.plan.tasks]
results = [task.result for task in self.planner.plan.tasks]
return {"codes": codes, "results": results}
def save_state(self, static_save=False):
"""
attribute:
@ -156,7 +173,7 @@ class ResearchAssistant(DataInterpreter):
stg_path = self.role_dir
name = self.get_node_name()
role_path = os.path.join(stg_path, f"{name}.json")
# 将状态保存为 JSON 文件
# save state as json file
write_json_file(role_path, self.model_dump())
def remap_tasks(self):

View file

@ -1,6 +1,7 @@
import argparse
import asyncio
from expo.data.custom_task import get_mle_is_lower_better, get_mle_task_id
from expo.experimenter.aug import AugExperimenter
from expo.experimenter.autogluon import GluonExperimenter
from expo.experimenter.autosklearn import AutoSklearnExperimenter
@ -9,7 +10,7 @@ from expo.experimenter.experimenter import Experimenter
from expo.experimenter.mcts import MCTSExperimenter
def get_args():
def get_args(cmd=True):
parser = argparse.ArgumentParser()
parser.add_argument("--name", type=str, default="")
parser.add_argument(
@ -22,7 +23,18 @@ def get_args():
get_di_args(parser)
get_mcts_args(parser)
get_aug_exp_args(parser)
return parser.parse_args()
if cmd:
args = parser.parse_args()
else:
args = parser.parse_args("")
if args.custom_dataset_dir:
args.external_eval = False
args.eval_func = "mlebench"
args.from_scratch = True
args.task = get_mle_task_id(args.custom_dataset_dir)
args.low_is_better = get_mle_is_lower_better(args.task)
return args
def get_mcts_args(parser):
@ -31,7 +43,17 @@ def get_mcts_args(parser):
parser.set_defaults(load_tree=False)
parser.add_argument("--rollouts", type=int, default=5)
parser.add_argument("--use_fixed_insights", dest="use_fixed_insights", action="store_true")
parser.set_defaults(use_fixed_insights=False)
parser.add_argument("--start_task_id", type=int, default=2)
parser.add_argument(
"--from_scratch", dest="from_scratch", action="store_true", help="Generate solutions from scratch"
)
parser.set_defaults(from_scratch=False)
parser.add_argument("--no_external_eval", dest="external_eval", action="store_false")
parser.set_defaults(external_eval=True)
parser.add_argument("--eval_func", type=str, default="sela", choices=["sela", "mlebench"])
parser.add_argument("--custom_dataset_dir", type=str, default=None)
parser.add_argument("--max_depth", type=int, default=4)
def get_aug_exp_args(parser):

View file

@ -0,0 +1,23 @@
import networkx as nx
from expo.evaluation.visualize_mcts import build_tree_recursive, visualize_tree
from expo.MCTS import MCTS, create_initial_state, initialize_di_root_node
from expo.run_experiment import get_args
from expo.utils import DATA_CONFIG
if __name__ == "__main__":
args = get_args()
data_config = DATA_CONFIG
state = create_initial_state(args.task, 0, data_config, args=args)
role, node = initialize_di_root_node(state)
mcts = MCTS(
root_node=node,
max_depth=5,
use_fixed_insights=False,
)
mcts.load_tree()
root = mcts.root_node
G = nx.DiGraph()
build_tree_recursive(G, "0", root)
visualize_tree(G, save_path="results/tree.png")

View file

@ -51,6 +51,8 @@ def get_exp_pool_path(task_name, data_config, pool_name="analysis_pool"):
f"Dataset {task_name} not found in config file. Available datasets: {data_config['datasets'].keys()}"
)
exp_pool_path = os.path.join(data_path, f"{pool_name}.json")
if not os.path.exists(exp_pool_path):
return None
return exp_pool_path
@ -109,7 +111,7 @@ async def load_execute_notebook(role):
codes = [task.code for task in tasks if task.code]
executor = role.execute_code
executor.nb = nbformat.v4.new_notebook()
executor.nb_client = NotebookClient(executor.nb, timeout=executor.timeout)
executor.nb_client = NotebookClient(executor.nb, timeout=role.role_timeout)
# await executor.build()
for code in codes:
outputs, success = await executor.run(code)