diff --git a/expo/MCTS.py b/expo/MCTS.py index 9026e09b4..af50ff7a0 100644 --- a/expo/MCTS.py +++ b/expo/MCTS.py @@ -4,9 +4,9 @@ import os import pandas as pd from expo.research_assistant import ResearchAssistant from expo.insights.InsightGenerate import InsightGenerator -from expo.dataset import get_split_dataset_path +from expo.dataset import get_split_dataset_path, generate_task_requirement from expo.evaluation.evaluation import evaluate_score -from expo.utils import mcts_logger, load_execute_notebook, generate_task_requirement, get_exp_pool_path +from expo.utils import mcts_logger, load_execute_notebook, get_exp_pool_path from metagpt.tools.tool_recommend import BM25ToolRecommender, ToolRecommender from metagpt.utils.common import write_json_file, read_json_file, format_trackback_info diff --git a/expo/data.yaml b/expo/data.yaml index d921e1ebf..df26e29e8 100644 --- a/expo/data.yaml +++ b/expo/data.yaml @@ -152,6 +152,6 @@ datasets: \ eval data. Do not plot or make any visualizations.\n" -work_dir: D:/work/MG-open/MetaGPT/workspace # path to the workspace directory +work_dir: ../workspace # path to the workspace directory role_dir: storage/team/environment/roles/ResearchAssistant_David # analysis_pool_dir: D:/work/MG-open/MetaGPT/examples/MCTS_test/analysis_pool_sample.json \ No newline at end of file diff --git a/expo/dataset.py b/expo/dataset.py index 4bce6e9fe..a507d0b7e 100644 --- a/expo/dataset.py +++ b/expo/dataset.py @@ -5,7 +5,7 @@ import os import json import yaml import pandas as pd -from examples.MCTS_test.insights.solution_designer import SolutionDesigner +from expo.insights.solution_designer import SolutionDesigner import asyncio BASE_USER_REQUIREMENT = """\ @@ -14,6 +14,35 @@ Perform data analysis, data preprocessing, feature engineering, and modeling to Report {metric} on the eval data. Do not plot or make any visualizations. """ +TASK_PROMPT = """\ +# User requirement +{user_requirement} +**Attention** Please do not leak the target label in any form during training. + +## Saving Dev and Test Predictions +Save the prediction results of the dev set and test set in `dev_predictions.csv` and `test_predictions.csv` respectively in the output directory BEFORE printig out the results. +The file should contain a single `target` column with the predicted values. +Make sure the prediction results are in the same format as the target column in the training set. The labels should be transformed back to the original format if any transformation was applied during training. + +## Output Training Set Performance +Make sure the performance of the model is printed in python in the last step even if it has been printed in the previous steps. The value should be a float number. +Print the training set performance in the last step. Write in this format: +```python +... +print("Train score:", train_score) +``` + +# Data dir +training: {train_path} +dev: {dev_path} +testing: {test_path} + +# Output dir +{output_dir} + +""" + + SEED = 100 TRAIN_TEST_SPLIT = 0.8 TRAIN_DEV_SPLIT = 0.75 @@ -89,6 +118,20 @@ def create_dataset_dict(dataset): } return dataset_dict +def generate_task_requirement(task_name, data_config): + user_requirement = get_user_requirement(task_name, data_config) + split_dataset_path = get_split_dataset_path(task_name, data_config) + train_path = split_dataset_path["train"] + dev_path = split_dataset_path["dev_wo_target"] + test_path = split_dataset_path["test_wo_target"] + work_dir = data_config["work_dir"] + output_dir = f"{work_dir}/{task_name}" + user_requirement = TASK_PROMPT.format(user_requirement=user_requirement, + train_path=train_path, dev_path=dev_path, test_path=test_path, + output_dir=output_dir) + return user_requirement + + class ExpDataset: description : str = None metadata : dict = None diff --git a/expo/insights/InsightGenerate.py b/expo/insights/InsightGenerate.py index de58b7e4e..55ab64e30 100644 --- a/expo/insights/InsightGenerate.py +++ b/expo/insights/InsightGenerate.py @@ -23,7 +23,7 @@ import random import json from metagpt.llm import LLM from metagpt.schema import Message -from examples.MCTS_test.utils import load_data_config, mcts_logger +from expo.utils import load_data_config, mcts_logger DATA_CONFIG = load_data_config() diff --git a/expo/insights/solution_designer.py b/expo/insights/solution_designer.py index 0986c392a..e2bf57ae3 100644 --- a/expo/insights/solution_designer.py +++ b/expo/insights/solution_designer.py @@ -3,7 +3,7 @@ import random import json from metagpt.llm import LLM from metagpt.schema import Message -from examples.MCTS_test.utils import clean_json_from_rsp, load_data_config +from expo.utils import clean_json_from_rsp, load_data_config DATA_CONFIG = load_data_config() diff --git a/expo/research_assistant.py b/expo/research_assistant.py index fbd74f7db..7b844cf5e 100644 --- a/expo/research_assistant.py +++ b/expo/research_assistant.py @@ -10,6 +10,9 @@ from metagpt.utils.common import write_json_file, read_json_file, format_trackba from metagpt.const import MESSAGE_ROUTE_TO_ALL, SERDESER_PATH from metagpt.utils.recovery_util import save_history from expo.utils import mcts_logger, save_notebook +from pydantic import Field, model_validator +from metagpt.actions.di.write_analysis_code import CheckData, WriteAnalysisCode + import re import os @@ -84,6 +87,17 @@ class ResearchAssistant(DataInterpreter): json_block = CodeParser.parse_code(block=None, text=rsp) score_dict = json.loads(json_block) return score_dict + + + @model_validator(mode="after") + def set_plan_and_tool(self) -> "Interpreter": + if self.planner.plan.goal != '': + self.set_actions([WriteAnalysisCode]) + self._set_state(0) + print("Plan already exists, skipping initialization.") + return self + print("Initializing plan and tool...") + return super().set_plan_and_tool() async def _act_on_task(self, current_task: Task) -> TaskResult: """Useful in 'plan_and_act' mode. Wrap the output in a TaskResult for review and confirmation.""" diff --git a/expo/run_exp_augmentation.py b/expo/run_exp_augmentation.py index 492a424d4..f4d22093f 100644 --- a/expo/run_exp_augmentation.py +++ b/expo/run_exp_augmentation.py @@ -1,10 +1,11 @@ import os -from metagpt.roles.di.research_assistant import ResearchAssistant +from expo.research_assistant import ResearchAssistant import asyncio -from examples.MCTS_test.utils import DATA_CONFIG, generate_task_requirement, get_exp_pool_path -from examples.MCTS_test.insights.InsightGenerate import InsightGenerator -from examples.MCTS_test.MCTS import create_initial_state -from examples.MCTS_test.evaluation.evaluation import evaluate_score +from expo.utils import DATA_CONFIG, get_exp_pool_path +from expo.dataset import generate_task_requirement +from expo.insights.InsightGenerate import InsightGenerator +from expo.MCTS import create_initial_state +from expo.evaluation.evaluation import evaluate_score import json import argparse import pandas as pd diff --git a/expo/run_experiment.py b/expo/run_experiment.py index e75897f5a..0c7468ac9 100644 --- a/expo/run_experiment.py +++ b/expo/run_experiment.py @@ -1,6 +1,7 @@ -from examples.MCTS_test.MCTS import MCTS, Node, initialize_di_root_node -from examples.MCTS_test.utils import load_data_config, generate_task_requirement -from examples.MCTS_test.visualize_mcts import get_tree_text +from expo.MCTS import MCTS, Node, initialize_di_root_node +from expo.utils import load_data_config +from expo.dataset import generate_task_requirement +from expo.evaluation.visualize_mcts import get_tree_text import asyncio import argparse diff --git a/expo/run_mcts.py b/expo/run_mcts.py index 0c0c486db..6d2c421ec 100644 --- a/expo/run_mcts.py +++ b/expo/run_mcts.py @@ -1,5 +1,7 @@ from expo.MCTS import MCTS, Node, initialize_di_root_node -from expo.utils import load_data_config, generate_task_requirement +from expo.utils import load_data_config +from expo.dataset import generate_task_requirement + from expo.evaluation.visualize_mcts import get_tree_text import asyncio import argparse diff --git a/expo/utils.py b/expo/utils.py index ac4a64697..423889f29 100644 --- a/expo/utils.py +++ b/expo/utils.py @@ -1,5 +1,4 @@ import yaml -from examples.MCTS_test.dataset import get_user_requirement, get_split_dataset_path from metagpt.roles.role import Role from metagpt.actions.di.execute_nb_code import ExecuteNbCode from metagpt.utils.save_code import save_code_file @@ -13,34 +12,6 @@ import sys import os import re -TASK_PROMPT = """\ -# User requirement -{user_requirement} -**Attention** Please do not leak the target label in any form during training. - -## Saving Dev and Test Predictions -Save the prediction results of the dev set and test set in `dev_predictions.csv` and `test_predictions.csv` respectively in the output directory BEFORE printig out the results. -The file should contain a single `target` column with the predicted values. -Make sure the prediction results are in the same format as the target column in the training set. The labels should be transformed back to the original format if any transformation was applied during training. - -## Output Training Set Performance -Make sure the performance of the model is printed in python in the last step even if it has been printed in the previous steps. The value should be a float number. -Print the training set performance in the last step. Write in this format: -```python -... -print("Train score:", train_score) -``` - -# Data dir -training: {train_path} -dev: {dev_path} -testing: {test_path} - -# Output dir -{output_dir} - -""" - def load_data_config(file_path="data.yaml"): with open(file_path, 'r') as stream: data_config = yaml.safe_load(stream) @@ -78,18 +49,6 @@ def get_exp_pool_path(task_name, data_config, pool_name="analysis_pool"): exp_pool_path = os.path.join(data_path, f"{pool_name}.json") return exp_pool_path -def generate_task_requirement(task_name, data_config): - user_requirement = get_user_requirement(task_name, data_config) - split_dataset_path = get_split_dataset_path(task_name, data_config) - train_path = split_dataset_path["train"] - dev_path = split_dataset_path["dev_wo_target"] - test_path = split_dataset_path["test_wo_target"] - work_dir = data_config["work_dir"] - output_dir = f"{work_dir}/{task_name}" - user_requirement = TASK_PROMPT.format(user_requirement=user_requirement, - train_path=train_path, dev_path=dev_path, test_path=test_path, - output_dir=output_dir) - return user_requirement def change_plan(role, plan): print(f"Change next plan to: {plan}")