From 32759f031c4c2e489e746c7c16d2e5017cdbebcc Mon Sep 17 00:00:00 2001 From: Yizhou Chi Date: Fri, 30 Aug 2024 19:55:40 +0800 Subject: [PATCH] add experimenter --- expo/MCTS.py | 24 +++++---- expo/dataset.py | 5 +- expo/experimenter/__init__.py | 3 ++ expo/experimenter/aug.py | 60 ++++++++++++++++++++++ expo/experimenter/aug_experimenter.py | 0 expo/experimenter/experimenter.py | 61 +++++++++++++++++++---- expo/experimenter/mcts.py | 44 ++++++++++++++++ expo/experimenter/mcts_experimenter.py | 0 expo/insights/InsightGenerate.py | 15 +----- expo/research_assistant.py | 2 - expo/run_experiment.py | 18 ++++--- expo/run_mcts.py | 9 ++-- expo/utils.py | 4 +- metagpt/prompts/di/write_analysis_code.py | 2 +- 14 files changed, 196 insertions(+), 51 deletions(-) create mode 100644 expo/experimenter/__init__.py create mode 100644 expo/experimenter/aug.py delete mode 100644 expo/experimenter/aug_experimenter.py create mode 100644 expo/experimenter/mcts.py delete mode 100644 expo/experimenter/mcts_experimenter.py diff --git a/expo/MCTS.py b/expo/MCTS.py index af50ff7a0..5c502f917 100644 --- a/expo/MCTS.py +++ b/expo/MCTS.py @@ -169,8 +169,8 @@ class Node(): # return evaluate_score(predictions, gt, metric) def evaluate_prediction(self, split): - pred_path = os.path.join(self.state["work_dir"], self.state["task"], f"{split}-predictions.csv") - pred_node_path = os.path.join(self.state["node_dir"], f"Node-{self.id}-{split}-predictions.csv") + pred_path = os.path.join(self.state["work_dir"], self.state["task"], f"{split}_predictions.csv") + pred_node_path = os.path.join(self.state["node_dir"], f"Node-{self.id}-{split}_predictions.csv") gt_path = os.path.join(self.state["datasets_dir"][f"{split}_target"]) preds = pd.read_csv(pred_path)["target"] preds.to_csv(pred_node_path, index=False) @@ -201,12 +201,12 @@ class Node(): score_dict = await role.get_score() score_dict = self.evaluate_simulation(score_dict) self.raw_reward = score_dict - if self.state["low_is_better"]: # normalized the score to be between 0 and 1, and higher is better def normalize_score(score): return 1 / (1 + score) score_dict = {k: normalize_score(v) for k, v in score_dict.items()} + self.normalized_reward = score_dict return score_dict @@ -262,19 +262,23 @@ class MCTS(): def best_path(self, root : Node): best_child = root best_score = 0 - def bfs(node : Node, best_score, best_child : Node): + def bfs(node : Node, best_score, best_child : Node, split): + assert split in ["test_score", "dev_score"] if node not in self.children: return best_score, best_child for child in self.children[node]: - print(child.id, child.raw_value) - if child.raw_value > best_score: - best_score = child.raw_value + score = child.normalized[split] + print(child.id, score) + if score > best_score: + best_score = score best_child = child best_score, best_child = bfs(child, best_score, best_child) return best_score, best_child - best_score, best_child = bfs(root, best_score, best_child) - mcts_logger.log("MCTS", f"Best Score: {best_score}, Best Node ID: {best_child.id}") - return best_child + _, best_child = bfs(root, best_score, best_child, "test_score") + _, dev_best_child = bfs(root, best_score, best_child, "dev_score") + + return {"dev_best": dev_best_child, + "global_best": best_child} def get_num_simulations(self): return self.root_node.visited diff --git a/expo/dataset.py b/expo/dataset.py index a507d0b7e..3f3fa1db1 100644 --- a/expo/dataset.py +++ b/expo/dataset.py @@ -20,8 +20,8 @@ TASK_PROMPT = """\ **Attention** Please do not leak the target label in any form during training. ## Saving Dev and Test Predictions -Save the prediction results of the dev set and test set in `dev_predictions.csv` and `test_predictions.csv` respectively in the output directory BEFORE printig out the results. -The file should contain a single `target` column with the predicted values. +Save the prediction results of BOTH the dev set and test set in `dev_predictions.csv` and `test_predictions.csv` respectively in the output directory. +Both files should contain a single `target` column with the predicted values. Make sure the prediction results are in the same format as the target column in the training set. The labels should be transformed back to the original format if any transformation was applied during training. ## Output Training Set Performance @@ -129,6 +129,7 @@ def generate_task_requirement(task_name, data_config): user_requirement = TASK_PROMPT.format(user_requirement=user_requirement, train_path=train_path, dev_path=dev_path, test_path=test_path, output_dir=output_dir) + print(user_requirement) return user_requirement diff --git a/expo/experimenter/__init__.py b/expo/experimenter/__init__.py new file mode 100644 index 000000000..d6b50c9d2 --- /dev/null +++ b/expo/experimenter/__init__.py @@ -0,0 +1,3 @@ +from .experimenter import Experimenter +from .mcts import MCTSExperimenter +from .aug import AugExperimenter \ No newline at end of file diff --git a/expo/experimenter/aug.py b/expo/experimenter/aug.py new file mode 100644 index 000000000..28643d47f --- /dev/null +++ b/expo/experimenter/aug.py @@ -0,0 +1,60 @@ +from experimenter import Experimenter +from expo.MCTS import create_initial_state +from expo.dataset import generate_task_requirement +from expo.utils import mcts_logger, load_execute_notebook, get_exp_pool_path +from expo.insights.InsightGenerate import InsightGenerator +from expo.research_assistant import ResearchAssistant + +EXPS_PROMPT = """ +When doing the tasks, you can refer to the insights below: +{experience} + +""" + + + + +class AugExperimenter(Experimenter): + result_path : str = "results/aug" + + async def run_experiment(self): + state = create_initial_state(self.args.task, start_task_id=1, data_config=self.data_config, low_is_better=self.args.low_is_better, name="") + user_requirement = state["requirement"] + exp_pool_path = get_exp_pool_path(self.args.task, self.data_config, pool_name="ds_analysis_pool") + exp_pool = InsightGenerator.load_analysis_pool(exp_pool_path) + if self.args.aug_mode == "single": + exps = InsightGenerator._random_sample(exp_pool, self.args.num_experiments) + exps = [exp["Analysis"] for exp in exps] + elif self.args.aug_mode == "set": + exp_set = InsightGenerator.sample_instruction_set(exp_pool) + exp_set_text = "\n".join([f"{exp['task_id']}: {exp['Analysis']}" for exp in exp_set]) + exps = [exp_set_text] * self.args.num_experiments + else: + raise ValueError(f"Invalid mode: {self.args.aug_mode}") + + results = [] + for i in range(self.args.num_experiments): + di = ResearchAssistant(node_id=str(i), use_reflection=self.args.use_reflection) + di.role_dir = f"{di.role_dir}_{self.args.task}" + requirement = user_requirement + EXPS_PROMPT.format(experience=exps[i]) + print(requirement) + await di.run(requirement) + score_dict = await di.get_score(low_is_better=False) + score_dict = self.evaluate(score_dict, state) + results.append({ + "idx": i, + "score_dict": score_dict, + "aug_mode": self.args.aug_mode, + "insights" : exps[i], + "user_requirement": user_requirement, + "args": self.args + }) + scores = [score_dict["test_score"] for score_dict in scores] + avg_score = sum(scores) / len(scores) + best_score = max(scores) if not self.args.low_is_better else min(scores) + best_score_idx = scores.index(best_score) + results.insert(0, {"avg_score": avg_score, "best_score": best_score, "best_score_idx": best_score_idx}) + self.save_results(results) + + + \ No newline at end of file diff --git a/expo/experimenter/aug_experimenter.py b/expo/experimenter/aug_experimenter.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/expo/experimenter/experimenter.py b/expo/experimenter/experimenter.py index d7ed82070..092af3694 100644 --- a/expo/experimenter/experimenter.py +++ b/expo/experimenter/experimenter.py @@ -1,18 +1,57 @@ +from expo.utils import DATA_CONFIG +import os +import pandas as pd +from expo.evaluation.evaluation import evaluate_score +import datetime +import json +from expo.MCTS import create_initial_state +from expo.research_assistant import ResearchAssistant + class Experimenter: result_path : str = "results" + data_config = DATA_CONFIG + + + def __init__(self, args, **kwargs): + self.args = args + self.start_time = datetime.datetime.now().strftime("%Y%m%d%H%M") async def run_experiment(self): - pass - + state = create_initial_state(self.args.task, start_task_id=1, data_config=self.data_config, low_is_better=self.args.low_is_better, name="") + user_requirement = state["requirement"] + di = ResearchAssistant(node_id="0", use_reflection=self.args.use_reflection) + await di.run(user_requirement) - def save_scores(self): - pass - - def save_result(self): + score_dict = await di.get_score(low_is_better=False) + score_dict = self.evaluate(score_dict, state) results = { - "test_score": self.test_score, - "num_experiments": self.num_experiments, - "insights": self.insights, - "avg_score": self.avg_score, - } \ No newline at end of file + "score_dict": score_dict, + "aug_mode": self.args.aug_mode, + "user_requirement": user_requirement, + "args": self.args + } + self.save_result(results) + + def evaluate_prediction(self, split, state): + pred_path = os.path.join(state["work_dir"], state["task"], f"{split}_predictions.csv") + pred_node_path = os.path.join(state["node_dir"], f"{self.start_time}-{split}_predictions.csv") + gt_path = os.path.join(state["datasets_dir"][f"{split}_target"]) + preds = pd.read_csv(pred_path)["target"] + preds.to_csv(pred_node_path, index=False) + gt = pd.read_csv(gt_path)["target"] + metric = state["dataset_config"]["metric"] + return evaluate_score(preds, gt, metric) + + def evaluate(self, score_dict, state): + scores = { + "dev_score": self.evaluate_prediction("dev", state), + "test_score": self.evaluate_prediction("test", state), + } + score_dict.update(scores) + return score_dict + + + def save_result(self, result): + with open(f"{self.result_path}/{self.args.task}_{self.start_time}.json", "w") as f: + json.dump(result, f, indent=4) diff --git a/expo/experimenter/mcts.py b/expo/experimenter/mcts.py new file mode 100644 index 000000000..2523588b9 --- /dev/null +++ b/expo/experimenter/mcts.py @@ -0,0 +1,44 @@ +from expo.experimenter import Experimenter +from expo.dataset import generate_task_requirement +from expo.MCTS import MCTS +from expo.evaluation.visualize_mcts import get_tree_text + + +class MCTSExperimenter(Experimenter): + result_path : str = "results/mcts" + async def run_experiment(self): + mcts = MCTS(root_node=None, max_depth=5) + best_nodes = await mcts.search(self.args.task, self.data_config, + low_is_better=self.args.low_is_better, + load_tree=self.args.load_tree, + reflection=self.args.reflection, + rollout=self.args.rollout, + name=self.args.name) + best_node = best_nodes["global_best"] + dev_best_node = best_nodes["dev_best"] + + text, num_generated_codes = get_tree_text(mcts.root_node) + text += f"Generated {num_generated_codes} unique codes.\n" + text += f"Best node: {best_node}, score: {best_node.raw_reward}\n" + text += f"Dev best node: {dev_best_node}, score: {dev_best_node.raw_reward}\n" + print(text) + self.save_tree(text) + + results = { + "best_node": best_node, + "best_node_score": best_node.raw_reward, + "dev_best_node": dev_best_node, + "dev_best_node_score": dev_best_node.raw_reward, + "num_generated_codes": num_generated_codes, + "user_requirement": best_node.state["requirement"], + "args": self.args + } + self.save_result(results) + + + + def save_tree(self, tree_text): + fpath = f"{self.result_path}/{self.args.task}_tree_{self.args.name}.txt" + with open(fpath, "w") as f: + f.write(tree_text) + diff --git a/expo/experimenter/mcts_experimenter.py b/expo/experimenter/mcts_experimenter.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/expo/insights/InsightGenerate.py b/expo/insights/InsightGenerate.py index 55ab64e30..35bed976a 100644 --- a/expo/insights/InsightGenerate.py +++ b/expo/insights/InsightGenerate.py @@ -23,7 +23,7 @@ import random import json from metagpt.llm import LLM from metagpt.schema import Message -from expo.utils import load_data_config, mcts_logger +from expo.utils import load_data_config, mcts_logger, clean_json_from_rsp DATA_CONFIG = load_data_config() @@ -52,17 +52,6 @@ class InsightGenerator: for task_id in sorted(data_dict.keys()): instruction_set.append(random.choice(data_dict[task_id])) return instruction_set - - - @staticmethod - def clean_json_from_rsp(text): - pattern = r"```json(.*?)```" - matches = re.findall(pattern, text, re.DOTALL) - if matches: - json_str = "\n".join(matches) - return json_str - else: - return "" @staticmethod def format_output(rsp): @@ -109,6 +98,6 @@ class InsightGenerator: llm_response = await llm.aask( context, system_msgs=[REFLECTION_SYSTEM_MSG] ) - rsp = InsightGenerator.clean_json_from_rsp(llm_response) + rsp = clean_json_from_rsp(llm_response) new_instruction = json.loads(rsp)["New Instruction"] return new_instruction \ No newline at end of file diff --git a/expo/research_assistant.py b/expo/research_assistant.py index 7b844cf5e..c26f24586 100644 --- a/expo/research_assistant.py +++ b/expo/research_assistant.py @@ -8,7 +8,6 @@ from metagpt.tools.tool_recommend import BM25ToolRecommender, ToolRecommender from metagpt.utils.common import CodeParser from metagpt.utils.common import write_json_file, read_json_file, format_trackback_info from metagpt.const import MESSAGE_ROUTE_TO_ALL, SERDESER_PATH -from metagpt.utils.recovery_util import save_history from expo.utils import mcts_logger, save_notebook from pydantic import Field, model_validator from metagpt.actions.di.write_analysis_code import CheckData, WriteAnalysisCode @@ -126,7 +125,6 @@ class ResearchAssistant(DataInterpreter): role_path = os.path.join(stg_path, f"{name}.json") # 将状态保存为 JSON 文件 write_json_file(role_path, self.model_dump()) - save_history(role=self, save_dir=stg_path, name=name) def remap_tasks(self): diff --git a/expo/run_experiment.py b/expo/run_experiment.py index 0c7468ac9..1704faa06 100644 --- a/expo/run_experiment.py +++ b/expo/run_experiment.py @@ -1,7 +1,4 @@ -from expo.MCTS import MCTS, Node, initialize_di_root_node -from expo.utils import load_data_config -from expo.dataset import generate_task_requirement -from expo.evaluation.visualize_mcts import get_tree_text +from expo.experimenter import MCTSExperimenter, Experimenter, AugExperimenter import asyncio import argparse @@ -9,11 +6,10 @@ import argparse def get_args(): parser = argparse.ArgumentParser() parser.add_argument("--name", type=str, default="") + parser.add_argument("--exp_mode", type=str, default="mcts", choices=["mcts", "aug", "base"]) get_di_args(parser) get_mcts_args(parser) get_aug_exp_args(parser) - - return parser.parse_args() @@ -38,7 +34,15 @@ def get_di_args(parser): async def main(args): - pass + if args.exp_mode == "mcts": + experimenter = MCTSExperimenter(args) + elif args.exp_mode == "aug": + experimenter = AugExperimenter(args) + elif args.exp_mode == "base": + experimenter = Experimenter(args) + else: + raise ValueError(f"Invalid exp_mode: {args.exp_mode}") + await experimenter.run_experiment() if __name__ == "__main__": args = get_args() diff --git a/expo/run_mcts.py b/expo/run_mcts.py index 6d2c421ec..7b2e2b4da 100644 --- a/expo/run_mcts.py +++ b/expo/run_mcts.py @@ -27,17 +27,19 @@ data_config = load_data_config() if __name__ == "__main__": args = get_args() - requirement = generate_task_requirement(args.task, data_config) - print(requirement) + # requirement = generate_task_requirement(args.task, data_config) + # print(requirement) # role, root_node = initialize_di_root_node(requirement, data_config) # asyncio.run(role.run(requirement)) # asyncio.run(root_node.run_node()) mcts = MCTS(root_node=None, max_depth=5) - best_node = asyncio.run(mcts.search(args.task, data_config, + best_nodes = asyncio.run(mcts.search(args.task, data_config, low_is_better=args.low_is_better, load_tree=args.load_tree, reflection=args.reflection, rollout=args.rollout, name=args.name)) + best_node = best_nodes["global_best"] + dev_best_node = best_nodes["dev_best"] text, num_generated_codes = get_tree_text(mcts.root_node) print(text) print(f"Generated {num_generated_codes} unique codes.") @@ -45,6 +47,7 @@ if __name__ == "__main__": with open(f"results/{args.task}_tree{args.name}.txt", "w") as f: f.write(f"Generated {num_generated_codes} unique codes.\n") f.write(f"Best node: {best_node}, score: {best_node.raw_reward}\n") + f.write(f"Dev best node: {dev_best_node}, score: {dev_best_node.raw_reward}\n") f.write(text) diff --git a/expo/utils.py b/expo/utils.py index 423889f29..20e3fa7f5 100644 --- a/expo/utils.py +++ b/expo/utils.py @@ -1,7 +1,6 @@ import yaml from metagpt.roles.role import Role from metagpt.actions.di.execute_nb_code import ExecuteNbCode -from metagpt.utils.save_code import save_code_file # from nbclient import NotebookClient from nbformat.notebooknode import NotebookNode import nbformat @@ -86,7 +85,8 @@ def process_cells(nb: NotebookNode) -> NotebookNode: def save_notebook(role: Role, save_dir: str = "", name: str = ""): save_dir = Path(save_dir) nb = process_cells(role.execute_code.nb) - save_code_file(name=name, code_context=nb, file_format="ipynb", save_dir=save_dir) + file_path = save_dir / f"{name}.ipynb" + nbformat.write(nb, file_path) async def load_execute_notebook(role): tasks = role.planner.plan.tasks diff --git a/metagpt/prompts/di/write_analysis_code.py b/metagpt/prompts/di/write_analysis_code.py index f8b9a4c42..beee80679 100644 --- a/metagpt/prompts/di/write_analysis_code.py +++ b/metagpt/prompts/di/write_analysis_code.py @@ -69,7 +69,7 @@ Output a json following the format: ```json {{ "reflection": str = "Reflection on previous implementation", - "improved_impl": str = "Refined code after reflection.", + "improved_impl": str = "Refined code after reflection (do not include nested code block here).", }} ``` """