1. add special instruction

2. add fixed insights
This commit is contained in:
Yizhou Chi 2024-09-14 15:17:42 +08:00
parent 9c3adbe0ac
commit 8beca0fadd
11 changed files with 111 additions and 47 deletions

View file

@ -15,18 +15,18 @@ from metagpt.tools.tool_recommend import ToolRecommender
from metagpt.utils.common import read_json_file
def initialize_di_root_node(task, data_config, low_is_better=False, reflection=True, name=""):
def initialize_di_root_node(state, reflection: bool = True):
start_task_id = 2
state = create_initial_state(
task, start_task_id=start_task_id, data_config=data_config, low_is_better=low_is_better, name=name
)
# state = create_initial_state(
# task, start_task_id=start_task_id, data_config=data_config, low_is_better=low_is_better, name=name
# )
role = ResearchAssistant(
node_id="0", start_task_id=start_task_id, use_reflection=reflection, role_dir=state["node_dir"]
)
return role, Node(parent=None, state=state, action=None, value=0)
def create_initial_state(task, start_task_id, data_config, low_is_better, name):
def create_initial_state(task, start_task_id, data_config, low_is_better: bool, name: str, special_instruction: str):
initial_state = {
"task": task,
"work_dir": data_config["work_dir"],
@ -34,7 +34,9 @@ def create_initial_state(task, start_task_id, data_config, low_is_better, name):
"dataset_config": data_config["datasets"][task],
"datasets_dir": get_split_dataset_path(task, data_config),
"exp_pool_path": get_exp_pool_path(task, data_config, pool_name="ds_analysis_pool"),
"requirement": generate_task_requirement(task, data_config),
"requirement": generate_task_requirement(
task, data_config, is_di=True, special_instruction=special_instruction
),
"has_run": False,
"start_task_id": start_task_id,
"low_is_better": low_is_better,
@ -157,6 +159,7 @@ class Node:
original_instruction=original_instruction,
max_num=max_children,
file_path=self.state["exp_pool_path"],
use_fixed_insights=self.use_fixed_insights,
)
new_state = self.state.copy()
new_state["start_task_id"] += 1
@ -234,9 +237,10 @@ class MCTS:
c_explore: float = 1.4
c_unvisited: float = 0.8
def __init__(self, root_node, max_depth):
def __init__(self, root_node, max_depth, use_fixed_insights):
self.root_node = root_node
self.max_depth = max_depth
self.use_fixed_insights = use_fixed_insights
def select(self, node: Node):
node = self.best_child()
@ -303,10 +307,8 @@ class MCTS:
def get_num_simulations(self):
return self.root_node.visited
async def search(self, task, data_config, name, rollouts, load_tree=False, low_is_better=False, reflection=False):
role, root = initialize_di_root_node(
task, data_config, low_is_better=low_is_better, reflection=reflection, name=name
)
async def search(self, state, rollouts, load_tree=False, reflection=False):
role, root = initialize_di_root_node(state, reflection=reflection)
self.root_node = root
tree_loaded = False
if load_tree:

View file

@ -187,16 +187,10 @@ ### Base DI
For setup, see Section 5 (DI MCTS).
- `python run_experiment.py --exp_mode base --task titanic --num_experiments 10`
- Ask DI to use AutoGluon: `--special_instruction ag`
- Ask DI to use the stacking ensemble method: `--special_instruction stacking`
### DI RandomSearch
For setup, check 5.
- Single insight
`python run_experiment.py --exp_mode aug --task titanic --aug_mode single`
- Set insight
`python run_experiment.py --exp_mode aug --task titanic --aug_mode set`
## 5. DI MCTS
@ -223,6 +217,20 @@ #### Run
- `python run_experiment.py --exp_mode mcts --task house_prices --rollout 10 --low_is_better`
To include the fixed insights saved in `insights/fixed_insights.json` in addition to the generated insights, add the following flag:
- `--use_fixed_insights`
#### Ablation Study
**DI RandomSearch**
- Single insight
`python run_experiment.py --exp_mode aug --task titanic --aug_mode single`
- Set insight
`python run_experiment.py --exp_mode aug --task titanic --aug_mode set`

View file

@ -10,16 +10,27 @@ from sklearn.model_selection import train_test_split
from expo.insights.solution_designer import SolutionDesigner
BASE_USER_REQUIREMENT = """\
BASE_USER_REQUIREMENT = """
This is a {datasetname} dataset. Your goal is to predict the target column `{target_col}`.
Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target.
Report {metric} on the eval data. Do not plot or make any visualizations.
"""
RECOMMENDATION = """\
USE_AG = """
7. Please use autogluon for model training with presets='medium_quality', time_limit=None, give dev dataset to tuning_data, and use right eval_metric.
"""
STACKING = """
7. To avoid overfitting, train a weighted ensemble model such as StackingClassifier or StackingRegressor.
8. You could do some quick model prototyping to see which models work best and then use them in the ensemble.
"""
SPECIAL_INSTRUCTIONS = {"ag": USE_AG, "stacking": STACKING}
RECOMMENDATION = """
## Base Models and Ensemble
You can consider using the following base models:
GBM (LightGBM) CAT (CatBoost) XGB (XGBoost) RF (random forest) XT (extremely randomized trees) KNN (k-nearest neighbors) LR (linear regression)
`GBM` (LightGBM) `CAT` (CatBoost) `XGB` (XGBoost) `RF` (random forest) `XT` (extremely randomized trees) `KNN` (k-nearest neighbors) LR (linear regression)
"""
DI_INSTRUCTION = (
@ -27,11 +38,10 @@ DI_INSTRUCTION = (
+ """**Attention**
1. Please do not leak the target label in any form during training.
2. Test set does not have the target column.
3. You should perform transformations on train, dev, and test sets at the same time (it's a good idea to define functions for this and avoid code repetition).
4. When scaling or transforming features, make sure the target column is not included.
5. You could utilize dev set to validate and improve model training.
6. To avoid overfitting, train a weighted ensemble model such as StackingClassifier or StackingRegressor using **dev set** after base models being trained
7. Make sure the model prototyping is fast.
3. When conducting data exploration or analysis, print out the results of your findings.
4. You should perform transformations on train, dev, and test sets at the same time (it's a good idea to define functions for this and avoid code repetition).
5. When scaling or transforming features, make sure the target column is not included.
6. You could utilize dev set to validate and improve model training. {special_instruction}
## Saving Dev and Test Predictions
1. Save the prediction results of BOTH the dev set and test set in `dev_predictions.csv` and `test_predictions.csv` respectively in the output directory.
@ -46,7 +56,7 @@ Print the train and dev set performance in the last step.
"""
)
TASK_PROMPT = """\
TASK_PROMPT = """
# User requirement
{user_requirement}
{additional_instruction}
@ -142,12 +152,18 @@ def create_dataset_dict(dataset):
return dataset_dict
def generate_di_instruction(output_dir):
additional_instruction = DI_INSTRUCTION.format(output_dir=output_dir)
def generate_di_instruction(output_dir, special_instruction):
    """Build the additional instruction text from ``DI_INSTRUCTION``.

    Args:
        output_dir: Value substituted for the ``output_dir`` field of
            ``DI_INSTRUCTION``.
        special_instruction: Key into ``SPECIAL_INSTRUCTIONS``. Any falsy
            value (e.g. ``None`` or ``""``) means no special instruction
            text is inserted.

    Returns:
        ``DI_INSTRUCTION`` formatted with ``output_dir`` and the selected
        special-instruction text (an empty string when none is requested).

    Raises:
        KeyError: If ``special_instruction`` is truthy but is not a key of
            ``SPECIAL_INSTRUCTIONS``.
    """
    # Resolve the optional key to its prompt text; fall back to an empty
    # string so the template placeholder is always filled.
    extra_prompt = SPECIAL_INSTRUCTIONS[special_instruction] if special_instruction else ""
    return DI_INSTRUCTION.format(output_dir=output_dir, special_instruction=extra_prompt)
def generate_task_requirement(task_name, data_config, is_di=True):
def generate_task_requirement(task_name, data_config, is_di=True, special_instruction=None):
user_requirement = get_user_requirement(task_name, data_config)
split_dataset_path = get_split_dataset_path(task_name, data_config)
train_path = split_dataset_path["train"]
@ -158,7 +174,7 @@ def generate_task_requirement(task_name, data_config, is_di=True):
datasets_dir = data_config["datasets_dir"]
data_info_path = f"{datasets_dir}/{task_name}/dataset_info.json"
if is_di:
additional_instruction = generate_di_instruction(output_dir)
additional_instruction = generate_di_instruction(output_dir, special_instruction)
else:
additional_instruction = ""
user_requirement = TASK_PROMPT.format(

View file

@ -17,7 +17,9 @@ class AugExperimenter(Experimenter):
# state = create_initial_state(self.args.task, start_task_id=1, data_config=self.data_config, low_is_better=self.args.low_is_better, name="")
user_requirement = self.state["requirement"]
exp_pool_path = get_exp_pool_path(self.args.task, self.data_config, pool_name="ds_analysis_pool")
exp_pool = InstructionGenerator.load_analysis_pool(exp_pool_path)
exp_pool = InstructionGenerator.load_analysis_pool(
exp_pool_path, use_fixed_insights=self.args.use_fixed_insights
)
if self.args.aug_mode == "single":
exps = InstructionGenerator._random_sample(exp_pool, self.args.num_experiments)
exps = [exp["Analysis"] for exp in exps]

View file

@ -18,7 +18,12 @@ class CustomExperimenter(Experimenter):
self.name = kwargs.get("name", "")
self.result_path = f"results/custom_{self.name}"
self.state = create_initial_state(
self.task, start_task_id=1, data_config=self.data_config, low_is_better=self.low_is_better, name=self.name
self.task,
start_task_id=1,
data_config=self.data_config,
low_is_better=self.low_is_better,
name=self.name,
special_instruction=self.args.special_instruction,
)
def run_experiment(self):

View file

@ -23,7 +23,8 @@ class Experimenter:
start_task_id=1,
data_config=self.data_config,
low_is_better=self.args.low_is_better,
name="",
name=self.args.name,
special_instruction=self.args.special_instruction,
)
async def run_di(self, di, user_requirement, run_idx):

View file

@ -13,19 +13,15 @@ class MCTSExperimenter(Experimenter):
async def run_experiment(self):
if self.tree_mode == "greedy":
mcts = Greedy(root_node=None, max_depth=5)
mcts = Greedy(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights)
elif self.tree_mode == "random":
mcts = Random(root_node=None, max_depth=5)
mcts = Random(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights)
else:
mcts = MCTS(root_node=None, max_depth=5)
mcts = MCTS(root_node=None, max_depth=5, use_fixed_insights=self.args.use_fixed_insights)
best_nodes = await mcts.search(
self.args.task,
self.data_config,
low_is_better=self.args.low_is_better,
load_tree=self.args.load_tree,
state=self.state,
reflection=self.args.reflection,
rollouts=self.args.rollouts,
name=self.args.name,
)
best_node = best_nodes["global_best"]
dev_best_node = best_nodes["dev_best"]

View file

@ -0,0 +1,22 @@
[
{
"Analysis": "Use early stopping, hyperparameter tuning, and cross-validation to avoid overfitting and improve robustness of the model.",
"Category": "Model Training",
"task_id": 4
},
{
"Analysis": "use k-fold bagging and early stopping",
"Category": "Model Training",
"task_id": 4
},
{
"Analysis": "To avoid overfitting, train a weighted ensemble model such as StackingClassifier or StackingRegressor using **dev set** after base models being trained.",
"Category": "Model Training",
"task_id": 4
},
{
"Analysis": "Please use autogluon for model training with presets='medium_quality', time_limit=None, give dev dataset to tuning_data, and use right eval_metric.",
"Category": "Model Training",
"task_id": 4
}
]

View file

@ -1,4 +1,5 @@
import json
import os
import random
from expo.utils import clean_json_from_rsp, load_data_config, mcts_logger
@ -68,8 +69,12 @@ class InstructionGenerator:
return new_data
@staticmethod
def load_analysis_pool(file_path, task_id=None):
def load_analysis_pool(file_path, use_fixed_insights, task_id=None):
data = InstructionGenerator.load_json_data(file_path)
if use_fixed_insights:
current_directory = os.path.dirname(__file__)
fixed_insights = InstructionGenerator.load_json_data(f"{current_directory}/fixed_insights.json")
data.extend(fixed_insights)
for item in data:
if "task_id" not in item:
raise ValueError("task_id is not found in the analysis pool")
@ -79,8 +84,12 @@ class InstructionGenerator:
return data
@staticmethod
async def generate_new_instructions(task_id, original_instruction, max_num, file_path, ext_info=None):
data = InstructionGenerator.load_analysis_pool(file_path, task_id)
async def generate_new_instructions(
task_id, original_instruction, max_num, file_path, ext_info=None, use_fixed_insights=False
):
data = InstructionGenerator.load_analysis_pool(
file_path, task_id=task_id, use_fixed_insights=use_fixed_insights
)
new_instructions = []
if len(data) == 0:
mcts_logger.log("MCTS", f"No insights available for task {task_id}")

View file

@ -3,3 +3,4 @@ openml==0.14.2
# ml module to run in DI
xgboost
catboost
lightgbm

View file

@ -28,11 +28,11 @@ def get_mcts_args(parser):
parser.add_argument("--no_load_tree", dest="load_tree", action="store_false")
parser.set_defaults(load_tree=False)
parser.add_argument("--rollouts", type=int, default=5)
parser.add_argument("--use_fixed_insights", dest="use_fixed_insights", action="store_true")
def get_aug_exp_args(parser):
parser.add_argument("--aug_mode", type=str, default="single", choices=["single", "set"])
parser.add_argument("--num_experiments", type=int, default=1)
def get_di_args(parser):
@ -41,6 +41,8 @@ def get_di_args(parser):
parser.set_defaults(low_is_better=False)
parser.add_argument("--reflection", dest="reflection", action="store_true")
parser.add_argument("--no_reflection", dest="reflection", action="store_false")
parser.add_argument("--num_experiments", type=int, default=1)
parser.add_argument("--special_instruction", type=str, default=None, choices=["ag", "stacking"])
parser.set_defaults(reflection=True)