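1. Change `work_dir` in data.yaml from a machine-specific absolute path to a more general relative path

2. Correct imports: point stale `examples.MCTS_test` imports at the `expo` package, and move `TASK_PROMPT` / `generate_task_requirement` from `expo/utils.py` to `expo/dataset.py`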
2026-06-11 15:15:18 +02:00 · 2024-08-30 16:59:38 +08:00 · 2024-08-30 16:59:38 +08:00 · d14f07f9b1
commit d14f07f9b1
parent 211f758b53
10 changed files with 76 additions and 56 deletions
--- a/expo/MCTS.py
+++ b/expo/MCTS.py
@ -4,9 +4,9 @@ import os
 import pandas as pd
 from expo.research_assistant import ResearchAssistant
 from expo.insights.InsightGenerate import InsightGenerator
-from expo.dataset import get_split_dataset_path
+from expo.dataset import get_split_dataset_path, generate_task_requirement
 from expo.evaluation.evaluation import evaluate_score
-from expo.utils import mcts_logger, load_execute_notebook, generate_task_requirement, get_exp_pool_path
+from expo.utils import mcts_logger, load_execute_notebook, get_exp_pool_path

 from metagpt.tools.tool_recommend import BM25ToolRecommender, ToolRecommender
 from metagpt.utils.common import write_json_file, read_json_file, format_trackback_info
--- a/expo/data.yaml
+++ b/expo/data.yaml
@ -152,6 +152,6 @@ datasets:
      \ eval data. Do not plot or make any visualizations.\n"


-work_dir: D:/work/MG-open/MetaGPT/workspace # path to the workspace directory
+work_dir: ../workspace # path to the workspace directory
 role_dir: storage/team/environment/roles/ResearchAssistant_David
 # analysis_pool_dir: D:/work/MG-open/MetaGPT/examples/MCTS_test/analysis_pool_sample.json
--- a/expo/dataset.py
+++ b/expo/dataset.py
@ -5,7 +5,7 @@ import os
 import json
 import yaml
 import pandas as pd
-from examples.MCTS_test.insights.solution_designer import SolutionDesigner
+from expo.insights.solution_designer import SolutionDesigner
 import asyncio

 BASE_USER_REQUIREMENT = """\
@ -14,6 +14,35 @@ Perform data analysis, data preprocessing, feature engineering, and modeling to
 Report {metric} on the eval data. Do not plot or make any visualizations.
 """

+TASK_PROMPT = """\
+# User requirement
+{user_requirement}
+**Attention** Please do not leak the target label in any form during training.
+
+## Saving Dev and Test Predictions
+Save the prediction results of the dev set and test set in `dev_predictions.csv` and `test_predictions.csv` respectively in the output directory BEFORE printig out the results. 
+The file should contain a single `target` column with the predicted values.
+Make sure the prediction results are in the same format as the target column in the training set. The labels should be transformed back to the original format if any transformation was applied during training.
+
+## Output Training Set Performance
+Make sure the performance of the model is printed in python in the last step even if it has been printed in the previous steps. The value should be a float number.
+Print the training set performance in the last step. Write in this format:
+```python
+...
+print("Train score:", train_score)
+```
+
+# Data dir
+training: {train_path}
+dev: {dev_path}
+testing: {test_path}
+
+# Output dir
+{output_dir}
+
+"""
+
+
 SEED = 100
 TRAIN_TEST_SPLIT = 0.8
 TRAIN_DEV_SPLIT = 0.75
@ -89,6 +118,20 @@ def create_dataset_dict(dataset):
    }
    return dataset_dict

+def generate_task_requirement(task_name, data_config):
+    user_requirement = get_user_requirement(task_name, data_config)
+    split_dataset_path = get_split_dataset_path(task_name, data_config)
+    train_path = split_dataset_path["train"]
+    dev_path = split_dataset_path["dev_wo_target"]
+    test_path = split_dataset_path["test_wo_target"]
+    work_dir = data_config["work_dir"]
+    output_dir = f"{work_dir}/{task_name}"
+    user_requirement = TASK_PROMPT.format(user_requirement=user_requirement, 
+                                          train_path=train_path, dev_path=dev_path, test_path=test_path,
+                                          output_dir=output_dir)
+    return user_requirement
+
+
 class ExpDataset:
    description : str = None
    metadata : dict = None
--- a/expo/insights/InsightGenerate.py
+++ b/expo/insights/InsightGenerate.py
@ -23,7 +23,7 @@ import random
 import json
 from metagpt.llm import LLM
 from metagpt.schema import Message
-from examples.MCTS_test.utils import load_data_config, mcts_logger
+from expo.utils import load_data_config, mcts_logger
 DATA_CONFIG = load_data_config()


--- a/expo/insights/solution_designer.py
+++ b/expo/insights/solution_designer.py
@ -3,7 +3,7 @@ import random
 import json
 from metagpt.llm import LLM
 from metagpt.schema import Message
-from examples.MCTS_test.utils import clean_json_from_rsp, load_data_config
+from expo.utils import clean_json_from_rsp, load_data_config


 DATA_CONFIG = load_data_config()
--- a/expo/research_assistant.py
+++ b/expo/research_assistant.py
@ -10,6 +10,9 @@ from metagpt.utils.common import write_json_file, read_json_file, format_trackba
 from metagpt.const import MESSAGE_ROUTE_TO_ALL, SERDESER_PATH
 from metagpt.utils.recovery_util import save_history
 from expo.utils import mcts_logger, save_notebook
+from pydantic import Field, model_validator
+from metagpt.actions.di.write_analysis_code import CheckData, WriteAnalysisCode
+
 import re
 import os

@ -84,6 +87,17 @@ class ResearchAssistant(DataInterpreter):
        json_block = CodeParser.parse_code(block=None, text=rsp)
        score_dict = json.loads(json_block)
        return score_dict
+    
+
+    @model_validator(mode="after")
+    def set_plan_and_tool(self) -> "Interpreter":
+        if self.planner.plan.goal != '':
+            self.set_actions([WriteAnalysisCode])
+            self._set_state(0)
+            print("Plan already exists, skipping initialization.")
+            return self
+        print("Initializing plan and tool...")
+        return super().set_plan_and_tool()

    async def _act_on_task(self, current_task: Task) -> TaskResult:
        """Useful in 'plan_and_act' mode. Wrap the output in a TaskResult for review and confirmation."""
--- a/expo/run_exp_augmentation.py
+++ b/expo/run_exp_augmentation.py
@ -1,10 +1,11 @@
 import os
-from metagpt.roles.di.research_assistant import ResearchAssistant
+from expo.research_assistant import ResearchAssistant
 import asyncio
-from examples.MCTS_test.utils import DATA_CONFIG, generate_task_requirement, get_exp_pool_path
-from examples.MCTS_test.insights.InsightGenerate import InsightGenerator
-from examples.MCTS_test.MCTS import create_initial_state
-from examples.MCTS_test.evaluation.evaluation import evaluate_score
+from expo.utils import DATA_CONFIG, get_exp_pool_path
+from expo.dataset import generate_task_requirement
+from expo.insights.InsightGenerate import InsightGenerator
+from expo.MCTS import create_initial_state
+from expo.evaluation.evaluation import evaluate_score
 import json
 import argparse
 import pandas as pd
--- a/expo/run_experiment.py
+++ b/expo/run_experiment.py
@ -1,6 +1,7 @@
-from examples.MCTS_test.MCTS import MCTS, Node, initialize_di_root_node
-from examples.MCTS_test.utils import load_data_config, generate_task_requirement
-from examples.MCTS_test.visualize_mcts import get_tree_text
+from expo.MCTS import MCTS, Node, initialize_di_root_node
+from expo.utils import load_data_config
+from expo.dataset import generate_task_requirement
+from expo.evaluation.visualize_mcts import get_tree_text
 import asyncio
 import argparse

--- a/expo/run_mcts.py
+++ b/expo/run_mcts.py
@ -1,5 +1,7 @@
 from expo.MCTS import MCTS, Node, initialize_di_root_node
-from expo.utils import load_data_config, generate_task_requirement
+from expo.utils import load_data_config
+from expo.dataset import generate_task_requirement
+
 from expo.evaluation.visualize_mcts import get_tree_text
 import asyncio
 import argparse
--- a/expo/utils.py
+++ b/expo/utils.py
@ -1,5 +1,4 @@
 import yaml
-from examples.MCTS_test.dataset import get_user_requirement, get_split_dataset_path
 from metagpt.roles.role import Role
 from metagpt.actions.di.execute_nb_code import ExecuteNbCode
 from metagpt.utils.save_code import save_code_file
@ -13,34 +12,6 @@ import sys
 import os
 import re

-TASK_PROMPT = """\
-# User requirement
-{user_requirement}
-**Attention** Please do not leak the target label in any form during training.
-
-## Saving Dev and Test Predictions
-Save the prediction results of the dev set and test set in `dev_predictions.csv` and `test_predictions.csv` respectively in the output directory BEFORE printig out the results. 
-The file should contain a single `target` column with the predicted values.
-Make sure the prediction results are in the same format as the target column in the training set. The labels should be transformed back to the original format if any transformation was applied during training.
-
-## Output Training Set Performance
-Make sure the performance of the model is printed in python in the last step even if it has been printed in the previous steps. The value should be a float number.
-Print the training set performance in the last step. Write in this format:
-```python
-...
-print("Train score:", train_score)
-```
-
-# Data dir
-training: {train_path}
-dev: {dev_path}
-testing: {test_path}
-
-# Output dir
-{output_dir}
-
-"""
-
 def load_data_config(file_path="data.yaml"):
    with open(file_path, 'r') as stream:
        data_config = yaml.safe_load(stream)
@ -78,18 +49,6 @@ def get_exp_pool_path(task_name, data_config, pool_name="analysis_pool"):
    exp_pool_path = os.path.join(data_path, f"{pool_name}.json")
    return exp_pool_path

-def generate_task_requirement(task_name, data_config):
-    user_requirement = get_user_requirement(task_name, data_config)
-    split_dataset_path = get_split_dataset_path(task_name, data_config)
-    train_path = split_dataset_path["train"]
-    dev_path = split_dataset_path["dev_wo_target"]
-    test_path = split_dataset_path["test_wo_target"]
-    work_dir = data_config["work_dir"]
-    output_dir = f"{work_dir}/{task_name}"
-    user_requirement = TASK_PROMPT.format(user_requirement=user_requirement, 
-                                          train_path=train_path, dev_path=dev_path, test_path=test_path,
-                                          output_dir=output_dir)
-    return user_requirement

 def change_plan(role, plan):
    print(f"Change next plan to: {plan}")