1. Change `work_dir` in data.yaml from an absolute, machine-specific path to a more general relative path

2. Correct imports (use the `expo` package instead of `examples.MCTS_test`)
This commit is contained in:
Yizhou Chi 2024-08-30 16:59:38 +08:00
parent 211f758b53
commit d14f07f9b1
10 changed files with 76 additions and 56 deletions

View file

@ -4,9 +4,9 @@ import os
import pandas as pd
from expo.research_assistant import ResearchAssistant
from expo.insights.InsightGenerate import InsightGenerator
from expo.dataset import get_split_dataset_path
from expo.dataset import get_split_dataset_path, generate_task_requirement
from expo.evaluation.evaluation import evaluate_score
from expo.utils import mcts_logger, load_execute_notebook, generate_task_requirement, get_exp_pool_path
from expo.utils import mcts_logger, load_execute_notebook, get_exp_pool_path
from metagpt.tools.tool_recommend import BM25ToolRecommender, ToolRecommender
from metagpt.utils.common import write_json_file, read_json_file, format_trackback_info

View file

@ -152,6 +152,6 @@ datasets:
\ eval data. Do not plot or make any visualizations.\n"
work_dir: D:/work/MG-open/MetaGPT/workspace # path to the workspace directory
work_dir: ../workspace # path to the workspace directory
role_dir: storage/team/environment/roles/ResearchAssistant_David
# analysis_pool_dir: D:/work/MG-open/MetaGPT/examples/MCTS_test/analysis_pool_sample.json

View file

@ -5,7 +5,7 @@ import os
import json
import yaml
import pandas as pd
from examples.MCTS_test.insights.solution_designer import SolutionDesigner
from expo.insights.solution_designer import SolutionDesigner
import asyncio
BASE_USER_REQUIREMENT = """\
@ -14,6 +14,35 @@ Perform data analysis, data preprocessing, feature engineering, and modeling to
Report {metric} on the eval data. Do not plot or make any visualizations.
"""
# Prompt template wrapped around the base user requirement.
# Placeholders filled via str.format: user_requirement, train_path, dev_path,
# test_path, output_dir.
TASK_PROMPT = """\
# User requirement
{user_requirement}
**Attention** Please do not leak the target label in any form during training.
## Saving Dev and Test Predictions
Save the prediction results of the dev set and test set in `dev_predictions.csv` and `test_predictions.csv` respectively in the output directory BEFORE printing out the results.
The file should contain a single `target` column with the predicted values.
Make sure the prediction results are in the same format as the target column in the training set. The labels should be transformed back to the original format if any transformation was applied during training.
## Output Training Set Performance
Make sure the performance of the model is printed in python in the last step even if it has been printed in the previous steps. The value should be a float number.
Print the training set performance in the last step. Write in this format:
```python
...
print("Train score:", train_score)
```
# Data dir
training: {train_path}
dev: {dev_path}
testing: {test_path}
# Output dir
{output_dir}
"""
# Fixed seed value; presumably used to make the dataset splitting reproducible — TODO confirm against usage.
SEED = 100
# Presumably the fraction of the full data kept on the train side of the train/test split — TODO confirm.
TRAIN_TEST_SPLIT = 0.8
# Presumably the fraction of the remaining train data kept for training in the train/dev split — TODO confirm.
TRAIN_DEV_SPLIT = 0.75
@ -89,6 +118,20 @@ def create_dataset_dict(dataset):
}
return dataset_dict
def generate_task_requirement(task_name, data_config):
    """Build the full task prompt for ``task_name``.

    Looks up the base user requirement and the split dataset paths for the
    task, then fills ``TASK_PROMPT`` with them and with an output directory
    of the form ``<work_dir>/<task_name>``.

    Args:
        task_name: Name of the task/dataset to build the prompt for.
        data_config: Loaded data configuration; must contain ``"work_dir"``.

    Returns:
        The formatted prompt string.
    """
    base_requirement = get_user_requirement(task_name, data_config)
    split_paths = get_split_dataset_path(task_name, data_config)
    # The dev/test paths come from the "*_wo_target" entries of the split mapping.
    train_path, dev_path, test_path = (
        split_paths[key] for key in ("train", "dev_wo_target", "test_wo_target")
    )
    work_dir = data_config["work_dir"]
    return TASK_PROMPT.format(
        user_requirement=base_requirement,
        train_path=train_path,
        dev_path=dev_path,
        test_path=test_path,
        output_dir=f"{work_dir}/{task_name}",
    )
class ExpDataset:
description : str = None
metadata : dict = None

View file

@ -23,7 +23,7 @@ import random
import json
from metagpt.llm import LLM
from metagpt.schema import Message
from examples.MCTS_test.utils import load_data_config, mcts_logger
from expo.utils import load_data_config, mcts_logger
DATA_CONFIG = load_data_config()

View file

@ -3,7 +3,7 @@ import random
import json
from metagpt.llm import LLM
from metagpt.schema import Message
from examples.MCTS_test.utils import clean_json_from_rsp, load_data_config
from expo.utils import clean_json_from_rsp, load_data_config
DATA_CONFIG = load_data_config()

View file

@ -10,6 +10,9 @@ from metagpt.utils.common import write_json_file, read_json_file, format_trackba
from metagpt.const import MESSAGE_ROUTE_TO_ALL, SERDESER_PATH
from metagpt.utils.recovery_util import save_history
from expo.utils import mcts_logger, save_notebook
from pydantic import Field, model_validator
from metagpt.actions.di.write_analysis_code import CheckData, WriteAnalysisCode
import re
import os
@ -84,6 +87,17 @@ class ResearchAssistant(DataInterpreter):
json_block = CodeParser.parse_code(block=None, text=rsp)
score_dict = json.loads(json_block)
return score_dict
@model_validator(mode="after")
def set_plan_and_tool(self) -> "Interpreter":
    """Skip plan/tool initialization when the planner already has a goal.

    If ``self.planner.plan.goal`` is a non-empty string, the role's actions
    are set to ``[WriteAnalysisCode]``, the state is set to 0 and ``self``
    is returned unchanged. Otherwise initialization is delegated to the
    parent class's ``set_plan_and_tool``.
    """
    has_existing_plan = self.planner.plan.goal != ''
    if not has_existing_plan:
        print("Initializing plan and tool...")
        return super().set_plan_and_tool()

    # A goal is already present: only wire up the action and state.
    self.set_actions([WriteAnalysisCode])
    self._set_state(0)
    print("Plan already exists, skipping initialization.")
    return self
async def _act_on_task(self, current_task: Task) -> TaskResult:
"""Useful in 'plan_and_act' mode. Wrap the output in a TaskResult for review and confirmation."""

View file

@ -1,10 +1,11 @@
import os
from metagpt.roles.di.research_assistant import ResearchAssistant
from expo.research_assistant import ResearchAssistant
import asyncio
from examples.MCTS_test.utils import DATA_CONFIG, generate_task_requirement, get_exp_pool_path
from examples.MCTS_test.insights.InsightGenerate import InsightGenerator
from examples.MCTS_test.MCTS import create_initial_state
from examples.MCTS_test.evaluation.evaluation import evaluate_score
from expo.utils import DATA_CONFIG, get_exp_pool_path
from expo.dataset import generate_task_requirement
from expo.insights.InsightGenerate import InsightGenerator
from expo.MCTS import create_initial_state
from expo.evaluation.evaluation import evaluate_score
import json
import argparse
import pandas as pd

View file

@ -1,6 +1,7 @@
from examples.MCTS_test.MCTS import MCTS, Node, initialize_di_root_node
from examples.MCTS_test.utils import load_data_config, generate_task_requirement
from examples.MCTS_test.visualize_mcts import get_tree_text
from expo.MCTS import MCTS, Node, initialize_di_root_node
from expo.utils import load_data_config
from expo.dataset import generate_task_requirement
from expo.evaluation.visualize_mcts import get_tree_text
import asyncio
import argparse

View file

@ -1,5 +1,7 @@
from expo.MCTS import MCTS, Node, initialize_di_root_node
from expo.utils import load_data_config, generate_task_requirement
from expo.utils import load_data_config
from expo.dataset import generate_task_requirement
from expo.evaluation.visualize_mcts import get_tree_text
import asyncio
import argparse

View file

@ -1,5 +1,4 @@
import yaml
from examples.MCTS_test.dataset import get_user_requirement, get_split_dataset_path
from metagpt.roles.role import Role
from metagpt.actions.di.execute_nb_code import ExecuteNbCode
from metagpt.utils.save_code import save_code_file
@ -13,34 +12,6 @@ import sys
import os
import re
# Prompt template wrapped around the base user requirement.
# Placeholders filled via str.format: user_requirement, train_path, dev_path,
# test_path, output_dir.
TASK_PROMPT = """\
# User requirement
{user_requirement}
**Attention** Please do not leak the target label in any form during training.
## Saving Dev and Test Predictions
Save the prediction results of the dev set and test set in `dev_predictions.csv` and `test_predictions.csv` respectively in the output directory BEFORE printing out the results.
The file should contain a single `target` column with the predicted values.
Make sure the prediction results are in the same format as the target column in the training set. The labels should be transformed back to the original format if any transformation was applied during training.
## Output Training Set Performance
Make sure the performance of the model is printed in python in the last step even if it has been printed in the previous steps. The value should be a float number.
Print the training set performance in the last step. Write in this format:
```python
...
print("Train score:", train_score)
```
# Data dir
training: {train_path}
dev: {dev_path}
testing: {test_path}
# Output dir
{output_dir}
"""
def load_data_config(file_path="data.yaml"):
with open(file_path, 'r') as stream:
data_config = yaml.safe_load(stream)
@ -78,18 +49,6 @@ def get_exp_pool_path(task_name, data_config, pool_name="analysis_pool"):
exp_pool_path = os.path.join(data_path, f"{pool_name}.json")
return exp_pool_path
def generate_task_requirement(task_name, data_config):
    """Return the ``TASK_PROMPT`` text filled in for ``task_name``.

    The base requirement comes from ``get_user_requirement`` and the data
    paths from ``get_split_dataset_path``; the output directory is
    ``<work_dir>/<task_name>`` with ``work_dir`` read from ``data_config``.

    Args:
        task_name: Name of the task/dataset to build the prompt for.
        data_config: Loaded data configuration; must contain ``"work_dir"``.

    Returns:
        The formatted prompt string.
    """
    requirement_text = get_user_requirement(task_name, data_config)
    dataset_paths = get_split_dataset_path(task_name, data_config)
    train_file = dataset_paths["train"]
    dev_file = dataset_paths["dev_wo_target"]
    test_file = dataset_paths["test_wo_target"]
    work_dir = data_config["work_dir"]
    prompt_fields = {
        "user_requirement": requirement_text,
        "train_path": train_file,
        "dev_path": dev_file,
        "test_path": test_file,
        "output_dir": f"{work_dir}/{task_name}",
    }
    return TASK_PROMPT.format(**prompt_fields)
def change_plan(role, plan):
print(f"Change next plan to: {plan}")