From c0262bcd8f1c8803da18d5a0c80bc0094e168ed2 Mon Sep 17 00:00:00 2001 From: Yizhou Chi Date: Fri, 6 Sep 2024 19:05:10 +0800 Subject: [PATCH] 1. add support to hf dataset 2. add support to datasets that have both train and test 3. create data folder 4. fix new instruction bug --- expo/MCTS.py | 2 +- expo/{ => data}/dataset.py | 50 +++++++++++--------- expo/data/hf_data.py | 64 ++++++++++++++++++++++++++ expo/experimenter/mcts.py | 4 +- expo/insights/instruction_generator.py | 2 +- 5 files changed, 97 insertions(+), 25 deletions(-) rename expo/{ => data}/dataset.py (87%) create mode 100644 expo/data/hf_data.py diff --git a/expo/MCTS.py b/expo/MCTS.py index 3331f35fa..4090331cd 100644 --- a/expo/MCTS.py +++ b/expo/MCTS.py @@ -6,7 +6,7 @@ import random import numpy as np import pandas as pd -from expo.dataset import generate_task_requirement, get_split_dataset_path +from expo.data.dataset import generate_task_requirement, get_split_dataset_path from expo.evaluation.evaluation import evaluate_score from expo.insights.instruction_generator import InstructionGenerator from expo.research_assistant import ResearchAssistant diff --git a/expo/dataset.py b/expo/data/dataset.py similarity index 87% rename from expo/dataset.py rename to expo/data/dataset.py index f7e0301b5..21dc19519 100644 --- a/expo/dataset.py +++ b/expo/data/dataset.py @@ -86,6 +86,8 @@ CUSTOM_DATASETS = [ ("07_icr-identify-age-related-conditions", "Class"), ] +DSAGENT_DATASETS = [("concrete-strength", "Strength"), ("smoker-status", "smoking"), ("software-defects", "defects")] + def get_split_dataset_path(dataset_name, config): datasets_dir = config["datasets_dir"] @@ -121,8 +123,8 @@ def get_user_requirement(task_name, config): ) -def save_datasets_dict_to_yaml(datasets_dict): - with open("datasets.yaml", "w") as file: +def save_datasets_dict_to_yaml(datasets_dict, name="datasets.yaml"): + with open(name, "w") as file: yaml.dump(datasets_dict, file) @@ -201,11 +203,15 @@ class ExpDataset: def get_raw_dataset(self): raw_dir = Path(self.dataset_dir, self.name, "raw") + train_df = None + test_df = None if not os.path.exists(Path(raw_dir, "train.csv")): raise FileNotFoundError(f"Raw dataset `train.csv` not found in {raw_dir}") else: - df = pd.read_csv(Path(raw_dir, "train.csv")) - return df + train_df = pd.read_csv(Path(raw_dir, "train.csv")) + if os.path.exists(Path(raw_dir, "test.csv")): + test_df = pd.read_csv(Path(raw_dir, "test.csv")) + return train_df, test_df def get_dataset_info(self): raw_df = pd.read_csv(Path(self.dataset_dir, self.name, "raw", "train.csv")) @@ -249,10 +255,10 @@ class ExpDataset: return req def save_dataset(self, target_col): - df = self.get_raw_dataset() + df, test_df = self.get_raw_dataset() if not self.check_dataset_exists() or self.force_update: print(f"Saving Dataset {self.name} in {self.dataset_dir}") - self.split_and_save(df, target_col) + self.split_and_save(df, target_col, test_df=test_df) else: print(f"Dataset {self.name} already exists") if not self.check_datasetinfo_exists() or self.force_update: @@ -278,10 +284,13 @@ class ExpDataset: df_target = df_target.drop(columns=[target_col]) df_target.to_csv(Path(path, f"split_{split}_target.csv"), index=False) - def split_and_save(self, df, target_col): + def split_and_save(self, df, target_col, test_df=None): if not target_col: raise ValueError("Target column not provided") - train, test = train_test_split(df, test_size=1 - TRAIN_TEST_SPLIT, random_state=SEED) + if test_df is None: + train, test = train_test_split(df, test_size=1 - TRAIN_TEST_SPLIT, random_state=SEED) + else: + train = df train, dev = train_test_split(train, test_size=1 - TRAIN_DEV_SPLIT, random_state=SEED) self.save_split_datasets(train, "train") self.save_split_datasets(dev, "dev", target_col) @@ -304,7 +313,7 @@ class OpenMLExpDataset(ExpDataset): raw_dir = Path(self.dataset_dir, self.name, "raw") os.makedirs(raw_dir, exist_ok=True) dataset_df.to_csv(Path(raw_dir, "train.csv"), index=False) - return dataset_df + return dataset_df, None def get_dataset_info(self): dataset_info = super().get_dataset_info() @@ -315,14 +324,9 @@ class OpenMLExpDataset(ExpDataset): return dataset_info -# class HFExpDataset(ExpDataset): -# def __init__(self, name, dataset_dir, dataset_name, **kwargs): -# super().__init__(name, dataset_dir, **kwargs) - - -async def process_dataset(dataset, solution_designer, save_analysis_pool, datasets_dict): +async def process_dataset(dataset, solution_designer: SolutionDesigner, save_analysis_pool, datasets_dict): if save_analysis_pool: - asyncio.run(solution_designer.generate_solutions(dataset.get_dataset_info(), dataset.name)) + await solution_designer.generate_solutions(dataset.get_dataset_info(), dataset.name) dataset_dict = create_dataset_dict(dataset) datasets_dict["datasets"][dataset.name] = dataset_dict @@ -330,14 +334,18 @@ async def process_dataset(dataset, solution_designer, save_analysis_pool, datase if __name__ == "__main__": datasets_dir = "D:/work/automl/datasets" force_update = False - save_analysis_pool = False + save_analysis_pool = True datasets_dict = {"datasets": {}} solution_designer = SolutionDesigner() - for dataset_id in OPENML_DATASET_IDS: - openml_dataset = OpenMLExpDataset("", datasets_dir, dataset_id, force_update=force_update) - asyncio.run(process_dataset(openml_dataset, solution_designer, save_analysis_pool, datasets_dict)) + # for dataset_id in OPENML_DATASET_IDS: + # openml_dataset = OpenMLExpDataset("", datasets_dir, dataset_id, force_update=force_update) + # asyncio.run(process_dataset(openml_dataset, solution_designer, save_analysis_pool, datasets_dict)) - for dataset_name, target_col in CUSTOM_DATASETS: + # for dataset_name, target_col in CUSTOM_DATASETS: + # custom_dataset = ExpDataset(dataset_name, datasets_dir, target_col=target_col, force_update=force_update) + # asyncio.run(process_dataset(custom_dataset, solution_designer, save_analysis_pool, datasets_dict)) + + for dataset_name, target_col in DSAGENT_DATASETS: custom_dataset = ExpDataset(dataset_name, datasets_dir, target_col=target_col, force_update=force_update) asyncio.run(process_dataset(custom_dataset, solution_designer, save_analysis_pool, datasets_dict)) diff --git a/expo/data/hf_data.py b/expo/data/hf_data.py new file mode 100644 index 000000000..a7e2a1afe --- /dev/null +++ b/expo/data/hf_data.py @@ -0,0 +1,64 @@ +import asyncio +import os +from pathlib import Path + +import pandas as pd +from datasets import load_dataset + +from expo.data.dataset import ExpDataset, process_dataset, save_datasets_dict_to_yaml +from expo.insights.solution_designer import SolutionDesigner + +HFDATSETS = [ + {"name": "sms_spam", "dataset_name": "ucirvine/sms_spam", "target_col": "label"}, + {"name": "banking77", "dataset_name": "PolyAI/banking77", "target_col": "label"}, + {"name": "gnad10", "dataset_name": "community-datasets/gnad10", "target_col": "label"}, + {"name": "oxford-iiit-pet", "dataset_name": "timm/oxford-iiit-pet", "target_col": "label"}, + {"name": "stanford_cars", "dataset_name": "tanganke/stanford_cars", "target_col": "label"}, + {"name": "fashion_mnist", "dataset_name": "zalando-datasets/fashion_mnist", "target_col": "label"}, +] + + +class HFExpDataset(ExpDataset): + train_ratio = 0.6 + dev_ratio = 0.2 + test_ratio = 0.2 + + def __init__(self, name, dataset_dir, dataset_name, **kwargs): + self.name = name + self.dataset_dir = dataset_dir + self.dataset_name = dataset_name + self.target_col = kwargs.get("target_col", "label") + self.dataset = load_dataset(dataset_name) + super().__init__(self.name, dataset_dir, **kwargs) + + def get_raw_dataset(self): + raw_dir = Path(self.dataset_dir, self.name, "raw") + raw_dir.mkdir(parents=True, exist_ok=True) + if os.path.exists(Path(raw_dir, "train.csv")): + df = pd.read_csv(Path(raw_dir, "train.csv")) + else: + df = self.dataset["train"].to_pandas() + df.to_csv(Path(raw_dir, "train.csv")) + + if os.path.exists(Path(raw_dir, "test.csv")): + test_df = pd.read_csv(Path(raw_dir, "test.csv")) + else: + if "test" in self.dataset: + test_df = self.dataset["test"].to_pandas() + test_df.to_csv(Path(raw_dir, "test.csv")) + else: + test_df = None + return df, test_df + + +if __name__ == "__main__": + dataset_dir = "D:/work/automl/datasets" + save_analysis_pool = True + datasets_dict = {"datasets": {}} + solution_designer = SolutionDesigner() + for dataset_meta in HFDATSETS: + hf_dataset = HFExpDataset( + dataset_meta["name"], dataset_dir, dataset_meta["dataset_name"], target_col=dataset_meta["target_col"] + ) + asyncio.run(process_dataset(hf_dataset, solution_designer, save_analysis_pool, datasets_dict)) + save_datasets_dict_to_yaml(datasets_dict, "hf_datasets.yaml") diff --git a/expo/experimenter/mcts.py b/expo/experimenter/mcts.py index 2805cae51..9db6e0807 100644 --- a/expo/experimenter/mcts.py +++ b/expo/experimenter/mcts.py @@ -22,8 +22,8 @@ class MCTSExperimenter(Experimenter): text, num_generated_codes = get_tree_text(mcts.root_node) text += f"Generated {num_generated_codes} unique codes.\n" - text += f"Best node: {best_node}, score: {best_node.raw_reward}\n" - text += f"Dev best node: {dev_best_node}, score: {dev_best_node.raw_reward}\n" + text += f"Best node: {best_node.id}, score: {best_node.raw_reward}\n" + text += f"Dev best node: {dev_best_node.id}, score: {dev_best_node.raw_reward}\n" print(text) self.save_tree(text) diff --git a/expo/insights/instruction_generator.py b/expo/insights/instruction_generator.py index 2cfee3107..c9ff7ec6e 100644 --- a/expo/insights/instruction_generator.py +++ b/expo/insights/instruction_generator.py @@ -84,7 +84,7 @@ class InstructionGenerator: new_instructions = [] if len(data) == 0: mcts_logger.log("MCTS", f"No insights available for task {task_id}") - return [original_instruction] # Return the original instruction if no insights are available + # return [original_instruction] # Return the original instruction if no insights are available for i in range(max_num): if len(data) == 0: insights = "No insights available"