diff --git a/expo/MCTS.py b/expo/MCTS.py index 9d778e4ed..2ce559ae0 100644 --- a/expo/MCTS.py +++ b/expo/MCTS.py @@ -3,10 +3,12 @@ import math import os import pickle import random +import shutil import numpy as np import pandas as pd +from expo.data.custom_task import get_mle_bench_requirements, get_mle_task_id from expo.data.dataset import generate_task_requirement, get_split_dataset_path from expo.evaluation.evaluation import evaluate_score from expo.insights.instruction_generator import InstructionGenerator @@ -17,32 +19,52 @@ from metagpt.utils.common import read_json_file def initialize_di_root_node(state, reflection: bool = True): - # state = create_initial_state( - # task, start_task_id=start_task_id, data_config=data_config, low_is_better=low_is_better, name=name - # ) role = ResearchAssistant( - node_id="0", start_task_id=state["start_task_id"], use_reflection=reflection, role_dir=state["node_dir"] + node_id="0", + start_task_id=state["start_task_id"], + use_reflection=reflection, + role_dir=state["node_dir"], + role_timeout=state["role_timeout"], ) return role, Node(parent=None, state=state, action=None, value=0) -def create_initial_state( - task, start_task_id, data_config, low_is_better: bool, name: str, special_instruction: str, args -): +def create_initial_state(task, start_task_id, data_config, args): + external_eval = args.external_eval + + if args.custom_dataset_dir: + dataset_config = None + datasets_dir = args.custom_dataset_dir + requirement = get_mle_bench_requirements( + args.custom_dataset_dir, data_config, special_instruction=args.special_instruction + ) + exp_pool_path = None + # external_eval = False # make sure external eval is false if custom dataset is used + task = get_mle_task_id(args.custom_dataset_dir) + else: + dataset_config = data_config["datasets"][task] + if dataset_config["metric"] == "rmse": + args.low_is_better = True + datasets_dir = get_split_dataset_path(task, data_config) + requirement = generate_task_requirement( + task, data_config, is_di=True, special_instruction=args.special_instruction + ) + exp_pool_path = get_exp_pool_path(task, data_config, pool_name="ds_analysis_pool") + initial_state = { "task": task, "work_dir": data_config["work_dir"], - "node_dir": os.path.join(data_config["work_dir"], data_config["role_dir"], f"{task}{name}"), - "dataset_config": data_config["datasets"][task], - "datasets_dir": get_split_dataset_path(task, data_config), - "exp_pool_path": get_exp_pool_path(task, data_config, pool_name="ds_analysis_pool"), - "requirement": generate_task_requirement( - task, data_config, is_di=True, special_instruction=special_instruction - ), + "node_dir": os.path.join(data_config["work_dir"], data_config["role_dir"], f"{task}{args.name}"), + "dataset_config": dataset_config, + "datasets_dir": datasets_dir, # won't be used if external eval is used + "exp_pool_path": exp_pool_path, + "requirement": requirement, "has_run": False, "start_task_id": start_task_id, - "low_is_better": low_is_better, + "low_is_better": args.low_is_better, "role_timeout": args.role_timeout, + "external_eval": external_eval, + "custom_dataset_dir": args.custom_dataset_dir, } os.makedirs(initial_state["node_dir"], exist_ok=True) return initial_state @@ -108,7 +130,7 @@ class Node: return f"{self.parent.id}-{num_sibling}" def is_terminal(self): - return int(self.state["start_task_id"]) == self.max_depth + 1 + return int(self.state["start_task_id"]) == self.max_depth + 1 # TODO: Check if this is correct or +1 def is_fully_expanded(self): return len(self.children) > 0 @@ -173,22 +195,34 @@ class Node: node.save_new_role(new_role) self.add_child(node) - def evaluate_prediction(self, split): - pred_path = os.path.join(self.state["work_dir"], self.state["task"], f"{split}_predictions.csv") - pred_node_path = os.path.join(self.state["node_dir"], f"Node-{self.id}-{split}_predictions.csv") + def get_predictions_path(self, split): + return os.path.join(self.state["node_dir"], f"Node-{self.id}-{split}_predictions.csv") + + def get_and_move_predictions(self, split): + if not os.path.exists(self.get_predictions_path(split)): + pred_path = os.path.join(self.state["work_dir"], self.state["task"], f"{split}_predictions.csv") + shutil.copy(pred_path, self.get_predictions_path(split)) + os.remove(pred_path) + return pd.read_csv(self.get_predictions_path(split)) + + def get_gt(self, split): gt_path = os.path.join(self.state["datasets_dir"][f"{split}_target"]) - preds = pd.read_csv(pred_path)["target"] - preds.to_csv(pred_node_path, index=False) - gt = pd.read_csv(gt_path)["target"] + return pd.read_csv(gt_path) + + def evaluate_prediction(self, split): + preds = self.get_and_move_predictions(split)["target"] + gt = self.get_gt(split)["target"] metric = self.state["dataset_config"]["metric"] - # remove original predictions.csv - os.remove(pred_path) return evaluate_score(preds, gt, metric) def evaluate_simulation(self, score_dict): - scores = {"dev_score": self.evaluate_prediction("dev"), "test_score": self.evaluate_prediction("test")} - scores["score"] = scores["dev_score"] - score_dict.update(scores) + if self.state["external_eval"]: # use external evaluation + scores = {"dev_score": self.evaluate_prediction("dev"), "test_score": self.evaluate_prediction("test")} + scores["score"] = scores["dev_score"] + score_dict.update(scores) + else: + self.get_and_move_predictions("dev") + self.get_and_move_predictions("test") return score_dict async def run_node(self, role=None): @@ -215,7 +249,6 @@ class Node: mcts_logger.log("MCTS", f"Role-level timeout: {e}") break except Exception as e: - print(f"Error: {e}") mcts_logger.log("MCTS", f"Error in running the role: {e}") num_runs += 1 @@ -235,14 +268,15 @@ class Node: score_dict = {k: normalize_score(v) for k, v in score_dict.items()} self.normalized_reward = score_dict - return score_dict + result_dict = role.get_solution() + return score_dict, result_dict class MCTS: # data_path root_node: Node = None children: dict = {} - max_depth: int = 5 + max_depth: int = None c_explore: float = 1.4 c_unvisited: float = 0.8 node_order: list = [] @@ -281,9 +315,9 @@ class MCTS: mcts_logger.log("MCTS", f"Start simulating node {node.id}:") while node.children: node = random.choice(node.children) - reward = await node.run_node(role) + reward, result_dict = await node.run_node(role) mcts_logger.log("MCTS", f"Simulated node's reward: {reward}") - + # TODO: add new insights return reward def backpropagate(self, node: Node, reward): @@ -341,12 +375,17 @@ class MCTS: scores["test_raw"].append(node.raw_reward["test_score"]) return scores - async def search(self, state, rollouts, load_tree=False, reflection=False): + async def search(self, state, args): + reflection = args.reflection + load_tree = args.load_tree + rollouts = args.rollouts + from_scratch = args.from_scratch role, root = initialize_di_root_node(state, reflection=reflection) self.root_node = root self.instruction_generator = InstructionGenerator( - file_path=state["exp_pool_path"], use_fixed_insights=self.use_fixed_insights + state=state, use_fixed_insights=self.use_fixed_insights, from_scratch=from_scratch ) + await self.instruction_generator.initialize() tree_loaded = False if load_tree: diff --git a/expo/README.md b/expo/README.md index a25f384b6..800afc3cc 100644 --- a/expo/README.md +++ b/expo/README.md @@ -6,7 +6,12 @@ # SELA: Tree-Search Enhanced LLM Agents for Automated Machine Learning ## 1. Data Preparation - Download Datasets:https://deepwisdom.feishu.cn/drive/folder/RVyofv9cvlvtxKdddt2cyn3BnTc?from=from_copylink - +- Download and prepare datasets from scratch: + ``` + cd expo/data + python dataset.py --save_analysis_pool + python hf_data.py --save_analysis_pool + ``` ## 2. Configs @@ -60,7 +65,7 @@ #### Run If the dataset has reg metric, remember to use `--low_is_better`: -- `python run_experiment.py --exp_mode mcts --task house_prices --rollouts 10 --low_is_better` +- `python run_experiment.py --exp_mode mcts --task house-prices --rollouts 10 --low_is_better` In addition to the generated insights, include the fixed insights saved in `expo/insights/fixed_insights.json` @@ -85,6 +90,23 @@ ## 4. Evaluation - Use the function `evaluate_score` to evaluate. +#### MLE-Bench +**Note: mle-bench requires python 3.11 or higher** +``` +git clone https://github.com/openai/mle-bench.git +cd mle-bench +pip install -e . +``` + +``` +mlebench prepare -c --data-dir +``` + +Enter the following command to run the experiment: +``` +python run_experiment.py --exp_mode mcts --custom_dataset_dir --rollouts 10 --from_scratch --role_timeout 3600 +``` + ## 5. Baselines ### DS Agent @@ -92,7 +114,7 @@ ### DS Agent git clone https://github.com/guosyjlu/DS-Agent.git ``` -将其deployment/generate.py line46-48行部分修改如下(目的是用deepseek而非GPT的API): +Modify the following lines in deployment/generate.py (lines 46-48) as shown below (the purpose is to use deepseek instead of OpenAI's API): ```python messages = [{"role": "user", "content": prompt}] @@ -120,7 +142,7 @@ ### DS Agent completion = raw_completion.split("```python")[1].split("```")[0] ``` -修改完后在新建一个`deployment/test.sh` 分别运行下列两行,`$TASK` 是你要测试的task name +After making the changes, create a new `deployment/test.sh` and run the following two lines separately, where `$TASK` is the name of the task you want to test ``` python -u generate.py --llm deepseek-coder --task $TASK --shot 1 --retrieval > "$TASK".txt 2>&1 @@ -135,7 +157,7 @@ #### Setup git clone https://github.com/WecoAI/aideml.git ``` -修改 `aideml/aide/utils/config.yaml` 其中的 `step` `k_fold_validation` `code model` `feedback model` 参数如下 +Modify `aideml/aide/utils/config.yaml` - change `k_fold_validation`, `code model`, and `feedback model` as follows: ```yaml # agent hyperparams @@ -153,16 +175,22 @@ # LLM settings for evaluating program output / tracebacks feedback: model: deepseek-coder temp: 0.5 + + # hyperparameters for the tree search + search: + max_debug_depth: 3 + debug_prob: 0.5 + num_drafts: 5 ``` -由于 deepseek 完全兼容 OpenAI 的 API,修改`base_url`为`自己的url`,`api_key`为`自己的key`即可 +Since Deepseek is compatible to OpenAI's API, change `base_url` into `your own url`,`api_key` into `your api key` ``` -export OPENAI_API_KEY="自己的key" -export OPENAI_BASE_URL="自己的url" +export OPENAI_API_KEY="your api key" +export OPENAI_BASE_URL="your own url" ``` -修改`aideml/aide/backend/__init__.py` 30 行内容如下: +Modify `aideml/aide/backend/__init__.py`'s line 30 and below: ```python model_kwargs = model_kwargs | { @@ -176,7 +204,7 @@ # LLM settings for evaluating program output / tracebacks query_func = backend_openai.query ``` -由于 deepseekV2.5 不再支持 system message 使用 function call,修改 `aideml/aide/agent.py` 312 行内容如下: +Since deepseekV2.5 no longer supports system message using function call, modify `aideml/aide/agent.py`'s line 312: ```python response = cast( @@ -191,7 +219,7 @@ # LLM settings for evaluating program output / tracebacks ) ``` -修改完后 +Modify and install: ``` cd aideml @@ -200,8 +228,8 @@ # LLM settings for evaluating program output / tracebacks #### Run -运行下面脚本获取运行结果,在当前目录下将生成一个 log 文件夹以及 workspace 文件夹 -log 文件夹中将包含实验使用配置以及生成方案记录,workspace 文件夹下将保存 aide 最后生成的结果文件 +Run the following script to get the running results, a `log` folder and a `workspace` folder will be generated in the current directory +The `log` folder will contain the experimental configuration and the generated scheme, and the `workspace` folder will save the final results generated by aide ``` python experimenter/aide.py @@ -227,7 +255,6 @@ #### Setup Replace {task_name} with the specific task you want to run. -提供github链接,并说明使用的命令以及参数设置 ### AutoSklearn #### System requirements auto-sklearn has the following system requirements: @@ -258,15 +285,4 @@ ### Base DI For setup, check 4. - `python run_experiment.py --exp_mode base --task titanic --num_experiments 10` - Specifically instruct DI to use AutoGluon: `--special_instruction ag` -- Specifically instruct DI to use the stacking ensemble method: `--special_instruction stacking` - - - - - - - - - - - +- Specifically instruct DI to use the stacking ensemble method: `--special_instruction stacking` \ No newline at end of file diff --git a/expo/data/custom_task.py b/expo/data/custom_task.py new file mode 100644 index 000000000..f3cd433f5 --- /dev/null +++ b/expo/data/custom_task.py @@ -0,0 +1,74 @@ +import os + +from expo.data.dataset import SPECIAL_INSTRUCTIONS +from expo.experimenter.mle_bench.instructions import ( + ADDITIONAL_NOTES, + INSTRUCTIONS, + INSTRUCTIONS_OBFUSCATED, +) + +MLE_BENCH_FILES = ["description.md", "description_obfuscated.md"] + + +MLE_REQUIREMENTS = """ +{instructions} + +{additonal_notes} + +COMPETITION INSTRUCTIONS +------ + +{task_description} + +## More Instructions +- output_dir: {output_dir} +- Besides `submission.csv`, you should also save your output in the output directory. +- You should split the training data into train and dev set. +- You should use the dev set to improve your model. Print the final dev set score after training. +- Save the prediction results of BOTH the dev set and test set in `dev_predictions.csv` and `test_predictions.csv` respectively in the output directory. They should be in the same format as the `submission.csv`. +- Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. {special_instruction} +**Do not make any plots or visualizations.** +""" + + +def get_mle_task_id(dataset_dir): + return dataset_dir.split("/")[-3] + + +def get_mle_is_lower_better(task): + from mlebench.data import get_leaderboard + from mlebench.registry import registry + + competition = registry.get_competition(task) + competition_leaderboard = get_leaderboard(competition) + return competition.grader.is_lower_better(competition_leaderboard) + + +def get_mle_bench_requirements(dataset_dir, data_config, special_instruction, obfuscated=False): + work_dir = data_config["work_dir"] + task = get_mle_task_id(dataset_dir) + output_dir = f"{work_dir}/{task}" + final_output_dir = f"{work_dir}/submission" + os.makedirs(output_dir, exist_ok=True) + if special_instruction: + special_instruction = SPECIAL_INSTRUCTIONS[special_instruction] + else: + special_instruction = "" + if obfuscated: + instructions = INSTRUCTIONS_OBFUSCATED.format(dataset_dir=dataset_dir, output_dir=final_output_dir) + task_file = "description_obfuscated.md" + else: + instructions = INSTRUCTIONS.format(dataset_dir=dataset_dir, output_dir=output_dir) + task_file = "description.md" + + with open(os.path.join(dataset_dir, task_file), encoding="utf-8") as f: + task_description = f.read() + mle_requirement = MLE_REQUIREMENTS.format( + instructions=instructions, + additonal_notes=ADDITIONAL_NOTES, + task_description=task_description, + output_dir=output_dir, + special_instruction=special_instruction, + ) + print(mle_requirement) + return mle_requirement diff --git a/expo/data/dataset.py b/expo/data/dataset.py index e076284d6..91490dcd7 100644 --- a/expo/data/dataset.py +++ b/expo/data/dataset.py @@ -1,3 +1,4 @@ +import argparse import asyncio import json import os @@ -18,22 +19,22 @@ Report {metric} on the eval data. Do not plot or make any visualizations. """ USE_AG = """ -7. Please use autogluon for model training with presets='medium_quality', time_limit=None, give dev dataset to tuning_data, and use right eval_metric. +- Please use autogluon for model training with presets='medium_quality', time_limit=None, give dev dataset to tuning_data, and use right eval_metric. """ TEXT_MODALITY = """ -7. You could use models from transformers library for this text dataset. -8. Use gpu if available for faster training. +- You could use models from transformers library for this text dataset. +- Use gpu if available for faster training. """ IMAGE_MODALITY = """ -7. You could use models from transformers/torchvision library for this image dataset. -8. Use gpu if available for faster training. +- You could use models from transformers/torchvision library for this image dataset. +- Use gpu if available for faster training. """ STACKING = """ -7. To avoid overfitting, train a weighted ensemble model such as StackingClassifier or StackingRegressor. -8. You could do some quick model prototyping to see which models work best and then use them in the ensemble. +- To avoid overfitting, train a weighted ensemble model such as StackingClassifier or StackingRegressor. +- You could do some quick model prototyping to see which models work best and then use them in the ensemble. """ @@ -268,7 +269,7 @@ class ExpDataset: dataset_info = self.get_dataset_info() num_classes = dataset_info["metadata"]["NumberOfClasses"] if num_classes == 2: - metric = "f1" + metric = "f1 binary" elif 2 < num_classes <= 200: metric = "f1 weighted" elif num_classes > 200 or num_classes == 0: @@ -361,10 +362,22 @@ async def process_dataset(dataset, solution_designer: SolutionDesigner, save_ana datasets_dict["datasets"][dataset.name] = dataset_dict +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--force_update", action="store_true", help="Force update datasets") + parser.add_argument("--save_analysis_pool", action="store_true", help="Save analysis pool") + parser.add_argument( + "--no_save_analysis_pool", dest="save_analysis_pool", action="store_false", help="Do not save analysis pool" + ) + parser.set_defaults(save_analysis_pool=True) + return parser.parse_args() + + if __name__ == "__main__": datasets_dir = DATA_CONFIG["datasets_dir"] - force_update = False - save_analysis_pool = True + args = parse_args() + force_update = args.force_update + save_analysis_pool = args.save_analysis_pool datasets_dict = {"datasets": {}} solution_designer = SolutionDesigner() for dataset_id in OPENML_DATASET_IDS: diff --git a/expo/data/hf_data.py b/expo/data/hf_data.py index 133fbdfa6..a18517d49 100644 --- a/expo/data/hf_data.py +++ b/expo/data/hf_data.py @@ -7,7 +7,12 @@ import pandas as pd from datasets import load_dataset from PIL import Image -from expo.data.dataset import ExpDataset, process_dataset, save_datasets_dict_to_yaml +from expo.data.dataset import ( + ExpDataset, + parse_args, + process_dataset, + save_datasets_dict_to_yaml, +) from expo.insights.solution_designer import SolutionDesigner from expo.utils import DATA_CONFIG @@ -116,8 +121,9 @@ class HFExpDataset(ExpDataset): if __name__ == "__main__": dataset_dir = DATA_CONFIG["datasets_dir"] - save_analysis_pool = True - force_update = False + args = parse_args() + force_update = args.force_update + save_analysis_pool = args.save_analysis_pool datasets_dict = {"datasets": {}} solution_designer = SolutionDesigner() for dataset_meta in HFDATSETS: diff --git a/expo/evaluation/evaluation.py b/expo/evaluation/evaluation.py index 16b3acb71..1e58e1725 100644 --- a/expo/evaluation/evaluation.py +++ b/expo/evaluation/evaluation.py @@ -1,3 +1,5 @@ +from pathlib import Path + import numpy as np from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, roc_auc_score @@ -22,3 +24,26 @@ def evaluate_score(pred, gt, metric): return mean_squared_error(np.log1p(gt), np.log1p(pred), squared=False) else: raise ValueError(f"Metric {metric} not supported") + + +def node_evaluate_score_sela(node): + preds = node.get_and_move_predictions("test")["target"] + gt = node.get_gt("test")["target"] + metric = node.state["dataset_config"]["metric"] + return evaluate_score(preds, gt, metric) + + +def node_evaluate_score_mlebench(node): + # TODO + from mlebench.grade import grade_csv + from mlebench.registry import registry + + competition_id = node.state["task"] + data_dir = Path(node.state["custom_dataset_dir"]).parent.parent.parent # prepared/public/../../../ + pred_path = node.get_predictions_path("test") + new_registry = registry.set_data_dir(data_dir) + competition = new_registry.get_competition(competition_id) + submission = Path(pred_path) + report = grade_csv(submission, competition).to_dict() + report["submission_path"] = str(submission) + return report diff --git a/expo/evaluation/visualize_mcts.py b/expo/evaluation/visualize_mcts.py index d310036c0..6a8869670 100644 --- a/expo/evaluation/visualize_mcts.py +++ b/expo/evaluation/visualize_mcts.py @@ -1,5 +1,8 @@ import textwrap +import matplotlib.pyplot as plt +import networkx as nx + from expo.MCTS import Node NODE_TEMPLATE = """\ @@ -11,6 +14,9 @@ Score: {score}, Visits: {num_visits} """ +NODE_SIZE = 12000 +NODE_FONT_SIZE = 18 + def get_role_plans(role): plans = role.planner.plan.tasks @@ -42,7 +48,7 @@ def get_tree_text(node: Node): id=node_id, plans=instruct_plans_text, simulated=simulated, score=score, num_visits=num_visits ) - def visualize_tree(node, depth=0, previous_plans=None): + def visualize_tree_text(node, depth=0, previous_plans=None): text = "" if node is not None: text += visualize_node(node, previous_plans) @@ -50,10 +56,107 @@ def get_tree_text(node: Node): code_set.update({task.instruction for task in role.planner.plan.tasks}) previous_plans = get_role_plans(role) for child in node.children: - text += textwrap.indent(visualize_tree(child, depth + 1, previous_plans), "\t") + text += textwrap.indent(visualize_tree_text(child, depth + 1, previous_plans), "\t") return text num_simulations = node.visited text = f"Number of simulations: {num_simulations}\n" - text += visualize_tree(node) + text += visualize_tree_text(node) return text, len(code_set) + + +def get_node_color(node): + if node["visits"] == 0: + return "#D3D3D3" + else: + # The higher the avg_value, the more intense the color + # avg_value is between 0 and 1 + avg_value = node["avg_value"] + # Convert avg_value to a color ranging from red (low) to green (high) + red = int(255 * (1 - avg_value)) + green = int(255 * avg_value) + return f"#{red:02X}{green:02X}00" + + +def visualize_tree(graph, show_instructions=False, save_path=""): + # Use a hierarchical layout for tree-like visualization + pos = nx.spring_layout(graph, k=0.9, iterations=50) + + plt.figure(figsize=(30, 20)) # Further increase figure size for better visibility + + # Calculate node levels + root = "0" + levels = nx.single_source_shortest_path_length(graph, root) + max_level = max(levels.values()) + + # Adjust y-coordinates based on levels and x-coordinates to prevent overlap + nodes_by_level = {} + for node, level in levels.items(): + if level not in nodes_by_level: + nodes_by_level[level] = [] + nodes_by_level[level].append(node) + + for level, nodes in nodes_by_level.items(): + y = 1 - level / max_level + x_step = 1.0 / (len(nodes) + 1) + for i, node in enumerate(sorted(nodes)): + pos[node] = ((i + 1) * x_step, y) + + # Draw edges + nx.draw_networkx_edges(graph, pos, edge_color="gray", arrows=True, arrowsize=40, width=3) + + # Draw nodes + node_colors = [get_node_color(graph.nodes[node]) for node in graph.nodes] + nx.draw_networkx_nodes(graph, pos, node_size=NODE_SIZE, node_color=node_colors) + + # Add labels to nodes + labels = nx.get_node_attributes(graph, "label") + nx.draw_networkx_labels(graph, pos, labels, font_size=NODE_FONT_SIZE) + + if show_instructions: + # Add instructions to the right side of nodes + instructions = nx.get_node_attributes(graph, "instruction") + for node, (x, y) in pos.items(): + wrapped_text = textwrap.fill(instructions[node], width=30) # Adjust width as needed + plt.text(x + 0.05, y, wrapped_text, fontsize=15, ha="left", va="center") + + plt.title("MCTS Tree Visualization", fontsize=40) + plt.axis("off") # Turn off axis + plt.tight_layout() + if save_path: + plt.savefig(save_path) + plt.show() + + +def build_tree_recursive(graph, parent_id, node, start_task_id=2): + """ + Recursively builds the entire tree starting from the root node. + Adds nodes and edges to the NetworkX graph. + """ + role = node.load_role() + depth = node.get_depth() + if depth == 0: + instruction = "\n\n".join([role.planner.plan.tasks[i].instruction for i in range(start_task_id)]) + else: + instruction = role.planner.plan.tasks[depth + start_task_id - 1].instruction + print(instruction) + # Add the current node with attributes to the graph + dev_score = node.raw_reward.get("dev_score", 0) * 100 + avg_score = node.avg_value() * 100 + graph.add_node( + parent_id, + label=f"{node.id}\nAvg: {avg_score:.1f}\nScore: {dev_score:.1f}\nVisits: {node.visited}", + avg_value=node.avg_value(), + dev_score=dev_score, + visits=node.visited, + instruction=instruction, + ) + # Stopping condition: if the node has no children, return + if not node.children: + return + + # Recursively create all child nodes + for i, child in enumerate(node.children): + child_id = f"{parent_id}-{i}" + graph.add_edge(parent_id, child_id) + build_tree_recursive(graph, child_id, child) diff --git a/expo/experimenter/custom.py b/expo/experimenter/custom.py index 92b7dafa2..f245499ca 100644 --- a/expo/experimenter/custom.py +++ b/expo/experimenter/custom.py @@ -21,9 +21,7 @@ class CustomExperimenter(Experimenter): self.task, start_task_id=1, data_config=self.data_config, - low_is_better=self.low_is_better, - name=self.name, - special_instruction=self.args.special_instruction, + args=self.args, ) def run_experiment(self): diff --git a/expo/experimenter/experimenter.py b/expo/experimenter/experimenter.py index 9aa879e24..4a0b8413e 100644 --- a/expo/experimenter/experimenter.py +++ b/expo/experimenter/experimenter.py @@ -24,9 +24,6 @@ class Experimenter: self.args.task, start_task_id=self.start_task_id, data_config=self.data_config, - low_is_better=self.args.low_is_better, - name=self.args.name, - special_instruction=self.args.special_instruction, args=self.args, ) @@ -43,7 +40,10 @@ class Experimenter: except Exception as e: print(f"Error: {e}") num_runs += 1 - save_notebook(role=di, save_dir=self.result_path, name=f"{self.args.task}_{self.start_time}_{run_idx}") + # save_notebook(role=di, save_dir=self.result_path, name=f"{self.args.task}_{self.start_time}_{run_idx}") + save_name = self.get_save_name() + save_notebook(role=di, save_dir=self.result_path, name=f"{save_name}_{run_idx}") + if not run_finished: score_dict = {"train_score": -1, "dev_score": -1, "test_score": -1, "score": -1} return score_dict diff --git a/expo/experimenter/mcts.py b/expo/experimenter/mcts.py index c063268c8..a42566366 100644 --- a/expo/experimenter/mcts.py +++ b/expo/experimenter/mcts.py @@ -1,5 +1,9 @@ import shutil +from expo.evaluation.evaluation import ( + node_evaluate_score_mlebench, + node_evaluate_score_sela, +) from expo.evaluation.visualize_mcts import get_tree_text from expo.experimenter.experimenter import Experimenter from expo.Greedy import Greedy, Random @@ -14,30 +18,35 @@ class MCTSExperimenter(Experimenter): self.start_task_id = 1 # start from datapreprocessing if it is image task else: self.start_task_id = args.start_task_id + + if args.eval_func == "sela": + self.eval_func = node_evaluate_score_sela + elif args.eval_func == "mlebench": + self.eval_func = node_evaluate_score_mlebench + super().__init__(args, **kwargs) self.tree_mode = tree_mode async def run_experiment(self): + use_fixed_insights = self.args.use_fixed_insights + depth = self.args.max_depth if self.tree_mode == "greedy": - mcts = Greedy(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights) + mcts = Greedy(root_node=None, max_depth=depth, use_fixed_insights=use_fixed_insights) elif self.tree_mode == "random": - mcts = Random(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights) + mcts = Random(root_node=None, max_depth=depth, use_fixed_insights=use_fixed_insights) else: - mcts = MCTS(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights) - best_nodes = await mcts.search( - state=self.state, - reflection=self.args.reflection, - rollouts=self.args.rollouts, - load_tree=self.args.load_tree, - ) + mcts = MCTS(root_node=None, max_depth=depth, use_fixed_insights=use_fixed_insights) + best_nodes = await mcts.search(state=self.state, args=self.args) best_node = best_nodes["global_best"] dev_best_node = best_nodes["dev_best"] score_dict = best_nodes["scores"] + additional_scores = {"grader": self.eval_func(dev_best_node)} text, num_generated_codes = get_tree_text(mcts.root_node) text += f"Generated {num_generated_codes} unique codes.\n" text += f"Best node: {best_node.id}, score: {best_node.raw_reward}\n" text += f"Dev best node: {dev_best_node.id}, score: {dev_best_node.raw_reward}\n" + text += f"Grader score: {additional_scores['grader']}\n" print(text) results = [ { @@ -50,6 +59,7 @@ class MCTSExperimenter(Experimenter): "tree_text": text, "args": vars(self.args), "scores": score_dict, + "additional_scores": additional_scores, } ] self.save_result(results) diff --git a/expo/experimenter/mle_bench/instructions.py b/expo/experimenter/mle_bench/instructions.py new file mode 100644 index 000000000..631f410dc --- /dev/null +++ b/expo/experimenter/mle_bench/instructions.py @@ -0,0 +1,48 @@ +SUBMISSION_INSTURCTION = """ +- **Submission checker**: We provide a tool for checking the validity of your submission file, in the form of a server endpoint at `http://localhost:5000/validate`. If you post a submission file to the server (e.g. `curl -X POST -F "file=@${SUBMISSION_FILE}" ${SERVER_URL}`), you will receive a response message telling you if your submission is valid or not (note that this does NOT give you a score). For your convenience, we provide a script `/home/validate_submission.sh` which can help you make this submission to the validation server. +""" +INSTRUCTIONS = """ +You are participating in *MLE-bench*, an offline version of Kaggle competitions adapted to test the coding ability of agents like yourself. + +You will be given a machine learning task. You must solve the task by training a model and running the model on the test set to produce a submission file. You may not hand-label the data to produce a submission file. + +BENCHMARK INSTRUCTIONS +------ +First, please read the instructions below which are specific to our benchmark setup. Note, these instructions can be found again in `expo.experimenter.mle_bench.instructions.py`. + +- **Competition-specific instructions**: The instructions for the competition are available in `{dataset_dir}/description.md`. Read this file to understand the problem setup for the competition you are currently solving. +- **Dataset**: The dataset files for the competition are available in `{dataset_dir}`. +- **Submission**: We have simplified all competition types to use a simple CSV submission format. Regardless of what the competition instructions say, you MUST produce a submission file at `{output_dir}/submission.csv`. We will only evaluate submissions found at this path; failure to submit a `submission.csv` at this path will result in a failed attempt. +- **Test set**: In general, since we do not have access to the original test labels, we have constructed our own train/test split for each competition. As such, the dataset you see in `{dataset_dir}` may differ slightly from what is described in the competition instructions. Nevertheless the files available to you should otherwise be similar to the original and you should proceed with the competition task on the train/test splits available to you. +- **Cheating**: You are not allowed to view others' solutions to this competition. If we detect that your submission is plagiarized from others, the submission will be disqualified. + +Your submission will be graded based on the metric in the competition-specific instructions. Your goal is to achieve the best score possible on the competition task. + +Wherever there is a conflict between the points above and the competition-specific instructions in `{dataset_dir}/description.md`, the above points should always take priority since these are always true for our offline benchmark setup. +""" + +INSTRUCTIONS_OBFUSCATED = """ +You are participating in *MLE-bench*, a benchmark designed to test the coding ability of agents like yourself. + +You will be given a machine learning task. You must solve the task by training a model and running the model on the test set to produce a submission file. You may not hand-label the data to produce a submission file. + +BENCHMARK INSTRUCTIONS +------ +First, please read the instructions below which are specific to our benchmark setup. Note, these instructions can be found again in `expo.experimenter.mle_bench.instructions.py`. + +- **Task-specific instructions**: The instructions for the task are available in `{dataset_dir}/description.md`. Read this file to understand the problem setup for the task you are currently solving. +- **Dataset**: The dataset files for the task are available in `{dataset_dir}/`. +- **Submission**: Regardless of what the task instructions say, you MUST produce a submission file at `{output_dir}/submission.csv`. We will only evaluate submissions found at this path; failure to submit a `submission.csv` at this path will result in a failed attempt. +- **Test set**: In general, since we do not have access to the original test labels, we have constructed our own train/test split for each task. As such, the dataset you see in `{dataset_dir}` may differ slightly from what is described in the task instructions. Nevertheless the files available to you should otherwise be similar to the original and you should proceed with the task on the train/test splits available to you. +- **Cheating**: You are not allowed to view others' solutions to this task. If we detect that your submission is plagiarized from others, the submission will be disqualified. + +Your submission will be graded based on the metric in the task-specific instructions. Your goal is to achieve the best score possible on the task. + +Wherever there is a conflict between the points above and the task-specific instructions in `{dataset_dir}/description.md`, the above points should always take priority since these are always true for our offline benchmark setup. +""" + +ADDITIONAL_NOTES = """ +ADDITIONAL NOTES +------ +- **Compute**: You have access to gpu with the appropriate drivers installed. +""" diff --git a/expo/insights/instruction_generator.py b/expo/insights/instruction_generator.py index 330795730..78b32e45d 100644 --- a/expo/insights/instruction_generator.py +++ b/expo/insights/instruction_generator.py @@ -1,6 +1,7 @@ import json import os import random +from difflib import SequenceMatcher from expo.insights.solution_designer import SolutionDesigner from expo.utils import clean_json_from_rsp, load_data_config, mcts_logger @@ -33,11 +34,30 @@ DATA_CONFIG = load_data_config() class InstructionGenerator: data_config = DATA_CONFIG - def __init__(self, file_path, use_fixed_insights=False): - self.file_path = file_path + def __init__(self, state, use_fixed_insights, from_scratch): + self.state = state + self.file_path = state["exp_pool_path"] + if state["custom_dataset_dir"]: + with open(f"{state['custom_dataset_dir']}/description.md", "r", encoding="utf-8") as file: + self.dataset_info = file.read() + else: + dataset_info_path = ( + f"{self.data_config['datasets_dir']}/{state['dataset_config']['dataset']}/dataset_info.json" + ) + with open(dataset_info_path, "r") as file: + self.dataset_info = json.load(file) self.use_fixed_insights = use_fixed_insights - self.analysis_pool = self.load_insight_pool(file_path, use_fixed_insights) self.proposer = SolutionDesigner() + if self.file_path is None: + self.from_scratch = True + else: + self.from_scratch = from_scratch + + async def initialize(self): + if self.from_scratch: + self.insight_pool = await self.generate_solutions_from_scratch(self.dataset_info, self.state["task"]) + else: + self.insight_pool = self.load_insight_pool(self.file_path, self.use_fixed_insights) @staticmethod def load_json_data(json_dir): @@ -84,14 +104,14 @@ class InstructionGenerator: data.extend(fixed_insights) for item in data: if "task_id" not in item: - raise ValueError("task_id is not found in the analysis pool") + raise ValueError("task_id is not found in the insight_pool") if task_id: data = [item for item in data if int(item["task_id"]) == int(task_id)] return data async def generate_new_instructions(self, task_id, original_instruction, max_num, ext_info=None): - data = self.analysis_pool + data = self.insight_pool new_instructions = [] if len(data) == 0: mcts_logger.log("MCTS", f"No insights available for task {task_id}") @@ -108,6 +128,34 @@ class InstructionGenerator: new_instructions.append(new_instruction) return new_instructions + async def propose_new_insights(self, solution, score): + new_insights = await self.proposer.propose_insights(solution, score) + added_insights = self.add_insight(new_insights) + return added_insights + + async def generate_solutions_from_scratch(self, dataset_info, dataset_name): + insight_pool = await self.proposer.generate_solutions(dataset_info, dataset_name, save_analysis_pool=False) + return insight_pool + + def add_insight(self, new_insights): + added_insights = [] + for new_insight in new_insights: + if not self.is_similar_to_existing(new_insight): + added_insights.append(new_insight) + self.insight_pool.append(new_insight) + return added_insights + + def is_similar_to_existing(self, new_insight, similarity_threshold=0.8): + for existing_insight in self.insight_pool: + similarity = self.calculate_similarity(new_insight["Analysis"], existing_insight["Analysis"]) + if similarity > similarity_threshold: + return True + return False + + @staticmethod + def calculate_similarity(text1, text2): + return SequenceMatcher(None, text1, text2).ratio() + @staticmethod async def generate_new_instruction(original_instruction, insights, ext_info): prompt = CHANGE_INSTRUCTION.format(instruction=original_instruction, insights=insights) diff --git a/expo/insights/solution_designer.py b/expo/insights/solution_designer.py index 9968131ca..262caa0f6 100644 --- a/expo/insights/solution_designer.py +++ b/expo/insights/solution_designer.py @@ -5,7 +5,8 @@ from metagpt.llm import LLM DATA_CONFIG = load_data_config() -DATASET_INSIGHT_PROMPT = """ + +DATASET_DESCRIPTION_SELA_PROMPT = """ # Dataset Description {dataset} @@ -14,6 +15,15 @@ DATASET_INSIGHT_PROMPT = """ # Dataset Head {head} +""" + +DATASET_DESCRIPTION_CUSTOM_PROMPT = """ +# Dataset Description +{dataset_description} +""" + +DATASET_INSIGHT_PROMPT = """ +{description} # Instruction Propose insights to help improve the performance of the model on this dataset. @@ -70,6 +80,45 @@ Your model choices should be advanced enough to be helpful. ``` """ + +INSIGHT_PROPOSAL_PROMPT = """ +You are an AI assistant tasked with analyzing a machine learning solution and proposing new insights to improve its performance. Given the current solution code and development score, suggest innovative approaches to enhance the model. + +Current Solution Code: +{solution_code} + +Development Score: {dev_score} + +Based on this information, propose 3-5 new insights across different aspects of the machine learning pipeline (Data Preprocessing, Feature Engineering, and Model Training). Your insights should be specific, actionable, and have the potential to improve the model's performance. + +Please format your response as a JSON array with the following structure: +[ + + {{ + "task_type": "Data Preprocessing", + "insights": [ + "insight1", + "insight2" + ] + }}, + {{ + "task_type": "Feature Engineering", + "insights": [ + "insight1", + "insight2" + ] + }}, + {{ + "task_type": "Model Training", + "insights": [ + "insight1", + "insight2" + ] + }} +] +""" + + KEY_DATASET_FEATURES = [ "NumberOfClasses", "NumberOfFeatures", @@ -86,18 +135,32 @@ TASK_TO_ID = {"EDA": 1, "Data Preprocessing": 2, "Feature Engineering": 3, "Mode class SolutionDesigner: data_dir: str = DATA_CONFIG["datasets_dir"] - async def generate_solutions(self, dataset_info, dataset_name): + async def generate_solutions(self, dataset_info, dataset_name, save_analysis_pool=True): llm = LLM() - context = DATASET_INSIGHT_PROMPT.format( - dataset=dataset_info["description"], - metadata=self.metadata_builder(dataset_info["metadata"]), - head=dataset_info["df_head"], - ) + if type(dataset_info) == dict: + description_prompt = DATASET_DESCRIPTION_SELA_PROMPT.format( + dataset=dataset_info["description"], + metadata=self.metadata_builder(dataset_info["metadata"]), + head=dataset_info["df_head"], + ) + else: + description_prompt = DATASET_DESCRIPTION_CUSTOM_PROMPT.format(dataset_description=dataset_info) + context = DATASET_INSIGHT_PROMPT.format(description=description_prompt) rsp = await llm.aask(context) rsp = clean_json_from_rsp(rsp) analysis_pool = self.process_analysis_pool(json.loads(rsp)) - dataset_path = f"{self.data_dir}/{dataset_name}" - self.save_analysis_pool(dataset_path, analysis_pool) + if save_analysis_pool: + dataset_path = f"{self.data_dir}/{dataset_name}" + self.save_analysis_pool(dataset_path, analysis_pool) + return analysis_pool + + async def propose_new_insights(self, solution, score): + llm = LLM() + context = INSIGHT_PROPOSAL_PROMPT.format(solution_code=solution, dev_score=score) + rsp = await llm.aask(context) + rsp = clean_json_from_rsp(rsp) + new_insights = self.process_analysis_pool(json.loads(rsp)) + return new_insights def process_analysis_pool(self, insights_rsp): analysis_pool = [] diff --git a/expo/research_assistant.py b/expo/research_assistant.py index fb34ece38..d068dd4e5 100644 --- a/expo/research_assistant.py +++ b/expo/research_assistant.py @@ -13,15 +13,19 @@ from metagpt.roles.di.data_interpreter import DataInterpreter from metagpt.schema import Message, Task, TaskResult from metagpt.utils.common import CodeParser, write_json_file -EXTRACT_SCORE_PROMPT = """ -# Code: +CODE_BLOCK_RESULT = """ +## Code: {code} -# Execution Result: +## Execution Result: {result} +""" +EXTRACT_SCORE_PROMPT = """ +# Code Blocks +{code_block} # Instruction: -Based on the code and execution result, please extract the scores and return it as a dictionary. +Based on the code and execution result, please extract the **final scores** and return it as a dictionary. If you cannot find the scores, please still return a dictionary with the keys 'train_score', 'dev_score', and 'test_score', and set the values to -1. # Format: @@ -109,9 +113,17 @@ class ResearchAssistant(DataInterpreter): return score_dict async def llm_extract_score(self): - result_text = self.planner.plan.task_map[str(len(self.planner.plan.task_map))].result - code_text = self.planner.plan.task_map[str(len(self.planner.plan.task_map))].code - rsp = await self.llm.aask(EXTRACT_SCORE_PROMPT.format(code=code_text, result=result_text, role="user")) + # result_text = self.planner.plan.task_map[str(len(self.planner.plan.task_map))].result + # code_text = self.planner.plan.task_map[str(len(self.planner.plan.task_map))].code + num_tasks = len(self.planner.plan.task_map) + task_map = self.planner.plan.task_map + code_block = "\n".join( + [ + CODE_BLOCK_RESULT.format(code=task_map[str(i + 1)].code, result=task_map[str(i + 1)].result) + for i in range(num_tasks) + ] + ) + rsp = await self.llm.aask(EXTRACT_SCORE_PROMPT.format(code_block=code_block, role="user")) json_block = CodeParser.parse_code(block=None, text=rsp) score_dict = json.loads(json_block) return score_dict @@ -139,6 +151,11 @@ class ResearchAssistant(DataInterpreter): save_notebook(role=self, save_dir=self.role_dir, name=self.get_node_name()) return task_result + def get_solution(self): + codes = [task.code for task in self.planner.plan.tasks] + results = [task.result for task in self.planner.plan.tasks] + return {"codes": codes, "results": results} + def save_state(self, static_save=False): """ attribute: @@ -156,7 +173,7 @@ class ResearchAssistant(DataInterpreter): stg_path = self.role_dir name = self.get_node_name() role_path = os.path.join(stg_path, f"{name}.json") - # 将状态保存为 JSON 文件 + # save state as json file write_json_file(role_path, self.model_dump()) def remap_tasks(self): diff --git a/expo/run_experiment.py b/expo/run_experiment.py index 15be27d60..68c3b35d4 100644 --- a/expo/run_experiment.py +++ b/expo/run_experiment.py @@ -1,6 +1,7 @@ import argparse import asyncio +from expo.data.custom_task import get_mle_is_lower_better, get_mle_task_id from expo.experimenter.aug import AugExperimenter from expo.experimenter.autogluon import GluonExperimenter from expo.experimenter.autosklearn import AutoSklearnExperimenter @@ -9,7 +10,7 @@ from expo.experimenter.experimenter import Experimenter from expo.experimenter.mcts import MCTSExperimenter -def get_args(): +def get_args(cmd=True): parser = argparse.ArgumentParser() parser.add_argument("--name", type=str, default="") parser.add_argument( @@ -22,7 +23,18 @@ def get_args(): get_di_args(parser) get_mcts_args(parser) get_aug_exp_args(parser) - return parser.parse_args() + if cmd: + args = parser.parse_args() + else: + args = parser.parse_args("") + + if args.custom_dataset_dir: + args.external_eval = False + args.eval_func = "mlebench" + args.from_scratch = True + args.task = get_mle_task_id(args.custom_dataset_dir) + args.low_is_better = get_mle_is_lower_better(args.task) + return args def get_mcts_args(parser): @@ -31,7 +43,17 @@ def get_mcts_args(parser): parser.set_defaults(load_tree=False) parser.add_argument("--rollouts", type=int, default=5) parser.add_argument("--use_fixed_insights", dest="use_fixed_insights", action="store_true") + parser.set_defaults(use_fixed_insights=False) parser.add_argument("--start_task_id", type=int, default=2) + parser.add_argument( + "--from_scratch", dest="from_scratch", action="store_true", help="Generate solutions from scratch" + ) + parser.set_defaults(from_scratch=False) + parser.add_argument("--no_external_eval", dest="external_eval", action="store_false") + parser.set_defaults(external_eval=True) + parser.add_argument("--eval_func", type=str, default="sela", choices=["sela", "mlebench"]) + parser.add_argument("--custom_dataset_dir", type=str, default=None) + parser.add_argument("--max_depth", type=int, default=4) def get_aug_exp_args(parser): diff --git a/expo/scripts/visualize_experiment.py b/expo/scripts/visualize_experiment.py new file mode 100644 index 000000000..42b4490ec --- /dev/null +++ b/expo/scripts/visualize_experiment.py @@ -0,0 +1,23 @@ +import networkx as nx + +from expo.evaluation.visualize_mcts import build_tree_recursive, visualize_tree +from expo.MCTS import MCTS, create_initial_state, initialize_di_root_node +from expo.run_experiment import get_args +from expo.utils import DATA_CONFIG + +if __name__ == "__main__": + args = get_args() + data_config = DATA_CONFIG + state = create_initial_state(args.task, 0, data_config, args=args) + role, node = initialize_di_root_node(state) + mcts = MCTS( + root_node=node, + max_depth=5, + use_fixed_insights=False, + ) + + mcts.load_tree() + root = mcts.root_node + G = nx.DiGraph() + build_tree_recursive(G, "0", root) + visualize_tree(G, save_path="results/tree.png") diff --git a/expo/utils.py b/expo/utils.py index b022879b0..21b311e7f 100644 --- a/expo/utils.py +++ b/expo/utils.py @@ -51,6 +51,8 @@ def get_exp_pool_path(task_name, data_config, pool_name="analysis_pool"): f"Dataset {task_name} not found in config file. Available datasets: {data_config['datasets'].keys()}" ) exp_pool_path = os.path.join(data_path, f"{pool_name}.json") + if not os.path.exists(exp_pool_path): + return None return exp_pool_path @@ -109,7 +111,7 @@ async def load_execute_notebook(role): codes = [task.code for task in tasks if task.code] executor = role.execute_code executor.nb = nbformat.v4.new_notebook() - executor.nb_client = NotebookClient(executor.nb, timeout=executor.timeout) + executor.nb_client = NotebookClient(executor.nb, timeout=role.role_timeout) # await executor.build() for code in codes: outputs, success = await executor.run(code)