From c0262bcd8f1c8803da18d5a0c80bc0094e168ed2 Mon Sep 17 00:00:00 2001
From: Yizhou Chi <chiyizhou@fuzhi.ai>
Date: Fri, 6 Sep 2024 19:05:10 +0800
Subject: [PATCH] 1. add support for hf datasets 2. add support for datasets
 that have both train and test splits 3. create data folder 4. fix new
 instruction bug

---
 expo/MCTS.py                           |  2 +-
 expo/{ => data}/dataset.py             | 50 +++++++++++---------
 expo/data/hf_data.py                   | 64 ++++++++++++++++++++++++++
 expo/experimenter/mcts.py              |  4 +-
 expo/insights/instruction_generator.py |  2 +-
 5 files changed, 97 insertions(+), 25 deletions(-)
 rename expo/{ => data}/dataset.py (87%)
 create mode 100644 expo/data/hf_data.py

diff --git a/expo/MCTS.py b/expo/MCTS.py
index 3331f35fa..4090331cd 100644
--- a/expo/MCTS.py
+++ b/expo/MCTS.py
@@ -6,7 +6,7 @@ import random
 import numpy as np
 import pandas as pd
 
-from expo.dataset import generate_task_requirement, get_split_dataset_path
+from expo.data.dataset import generate_task_requirement, get_split_dataset_path
 from expo.evaluation.evaluation import evaluate_score
 from expo.insights.instruction_generator import InstructionGenerator
 from expo.research_assistant import ResearchAssistant
diff --git a/expo/dataset.py b/expo/data/dataset.py
similarity index 87%
rename from expo/dataset.py
rename to expo/data/dataset.py
index f7e0301b5..21dc19519 100644
--- a/expo/dataset.py
+++ b/expo/data/dataset.py
@@ -86,6 +86,8 @@ CUSTOM_DATASETS = [
     ("07_icr-identify-age-related-conditions", "Class"),
 ]
 
+DSAGENT_DATASETS = [("concrete-strength", "Strength"), ("smoker-status", "smoking"), ("software-defects", "defects")]
+
 
 def get_split_dataset_path(dataset_name, config):
     datasets_dir = config["datasets_dir"]
@@ -121,8 +123,8 @@ def get_user_requirement(task_name, config):
         )
 
 
-def save_datasets_dict_to_yaml(datasets_dict):
-    with open("datasets.yaml", "w") as file:
+def save_datasets_dict_to_yaml(datasets_dict, name="datasets.yaml"):
+    with open(name, "w") as file:
         yaml.dump(datasets_dict, file)
 
 
@@ -201,11 +203,15 @@ class ExpDataset:
 
     def get_raw_dataset(self):
         raw_dir = Path(self.dataset_dir, self.name, "raw")
+        train_df = None
+        test_df = None
         if not os.path.exists(Path(raw_dir, "train.csv")):
             raise FileNotFoundError(f"Raw dataset `train.csv` not found in {raw_dir}")
         else:
-            df = pd.read_csv(Path(raw_dir, "train.csv"))
-            return df
+            train_df = pd.read_csv(Path(raw_dir, "train.csv"))
+        if os.path.exists(Path(raw_dir, "test.csv")):
+            test_df = pd.read_csv(Path(raw_dir, "test.csv"))
+        return train_df, test_df
 
     def get_dataset_info(self):
         raw_df = pd.read_csv(Path(self.dataset_dir, self.name, "raw", "train.csv"))
@@ -249,10 +255,10 @@ class ExpDataset:
         return req
 
     def save_dataset(self, target_col):
-        df = self.get_raw_dataset()
+        df, test_df = self.get_raw_dataset()
         if not self.check_dataset_exists() or self.force_update:
             print(f"Saving Dataset {self.name} in {self.dataset_dir}")
-            self.split_and_save(df, target_col)
+            self.split_and_save(df, target_col, test_df=test_df)
         else:
             print(f"Dataset {self.name} already exists")
         if not self.check_datasetinfo_exists() or self.force_update:
@@ -278,10 +284,13 @@ class ExpDataset:
                 df_target = df_target.drop(columns=[target_col])
             df_target.to_csv(Path(path, f"split_{split}_target.csv"), index=False)
 
-    def split_and_save(self, df, target_col):
+    def split_and_save(self, df, target_col, test_df=None):
         if not target_col:
             raise ValueError("Target column not provided")
-        train, test = train_test_split(df, test_size=1 - TRAIN_TEST_SPLIT, random_state=SEED)
+        if test_df is None:
+            train, test = train_test_split(df, test_size=1 - TRAIN_TEST_SPLIT, random_state=SEED)
+        else:
+            train, test = df, test_df  # use the provided test split; otherwise `test` is unbound on this path
         train, dev = train_test_split(train, test_size=1 - TRAIN_DEV_SPLIT, random_state=SEED)
         self.save_split_datasets(train, "train")
         self.save_split_datasets(dev, "dev", target_col)
@@ -304,7 +313,7 @@ class OpenMLExpDataset(ExpDataset):
         raw_dir = Path(self.dataset_dir, self.name, "raw")
         os.makedirs(raw_dir, exist_ok=True)
         dataset_df.to_csv(Path(raw_dir, "train.csv"), index=False)
-        return dataset_df
+        return dataset_df, None
 
     def get_dataset_info(self):
         dataset_info = super().get_dataset_info()
@@ -315,14 +324,9 @@ class OpenMLExpDataset(ExpDataset):
         return dataset_info
 
 
-# class HFExpDataset(ExpDataset):
-#     def __init__(self, name, dataset_dir, dataset_name, **kwargs):
-#         super().__init__(name, dataset_dir, **kwargs)
-
-
-async def process_dataset(dataset, solution_designer, save_analysis_pool, datasets_dict):
+async def process_dataset(dataset, solution_designer: SolutionDesigner, save_analysis_pool, datasets_dict):
     if save_analysis_pool:
-        asyncio.run(solution_designer.generate_solutions(dataset.get_dataset_info(), dataset.name))
+        await solution_designer.generate_solutions(dataset.get_dataset_info(), dataset.name)
     dataset_dict = create_dataset_dict(dataset)
     datasets_dict["datasets"][dataset.name] = dataset_dict
 
@@ -330,14 +334,18 @@ async def process_dataset(dataset, solution_designer, save_analysis_pool, datase
 if __name__ == "__main__":
     datasets_dir = "D:/work/automl/datasets"
     force_update = False
-    save_analysis_pool = False
+    save_analysis_pool = True
     datasets_dict = {"datasets": {}}
     solution_designer = SolutionDesigner()
-    for dataset_id in OPENML_DATASET_IDS:
-        openml_dataset = OpenMLExpDataset("", datasets_dir, dataset_id, force_update=force_update)
-        asyncio.run(process_dataset(openml_dataset, solution_designer, save_analysis_pool, datasets_dict))
+    # for dataset_id in OPENML_DATASET_IDS:
+    #     openml_dataset = OpenMLExpDataset("", datasets_dir, dataset_id, force_update=force_update)
+    #     asyncio.run(process_dataset(openml_dataset, solution_designer, save_analysis_pool, datasets_dict))
 
-    for dataset_name, target_col in CUSTOM_DATASETS:
+    # for dataset_name, target_col in CUSTOM_DATASETS:
+    #     custom_dataset = ExpDataset(dataset_name, datasets_dir, target_col=target_col, force_update=force_update)
+    #     asyncio.run(process_dataset(custom_dataset, solution_designer, save_analysis_pool, datasets_dict))
+
+    for dataset_name, target_col in DSAGENT_DATASETS:
         custom_dataset = ExpDataset(dataset_name, datasets_dir, target_col=target_col, force_update=force_update)
         asyncio.run(process_dataset(custom_dataset, solution_designer, save_analysis_pool, datasets_dict))
 
diff --git a/expo/data/hf_data.py b/expo/data/hf_data.py
new file mode 100644
index 000000000..a7e2a1afe
--- /dev/null
+++ b/expo/data/hf_data.py
@@ -0,0 +1,64 @@
+import asyncio
+import os
+from pathlib import Path
+
+import pandas as pd
+from datasets import load_dataset
+
+from expo.data.dataset import ExpDataset, process_dataset, save_datasets_dict_to_yaml
+from expo.insights.solution_designer import SolutionDesigner
+
+HFDATSETS = [  # one entry per dataset: local name, identifier passed to load_dataset, and target column
+    {"name": "sms_spam", "dataset_name": "ucirvine/sms_spam", "target_col": "label"},
+    {"name": "banking77", "dataset_name": "PolyAI/banking77", "target_col": "label"},
+    {"name": "gnad10", "dataset_name": "community-datasets/gnad10", "target_col": "label"},
+    {"name": "oxford-iiit-pet", "dataset_name": "timm/oxford-iiit-pet", "target_col": "label"},
+    {"name": "stanford_cars", "dataset_name": "tanganke/stanford_cars", "target_col": "label"},
+    {"name": "fashion_mnist", "dataset_name": "zalando-datasets/fashion_mnist", "target_col": "label"},
+]
+
+
+class HFExpDataset(ExpDataset):
+    """ExpDataset whose raw train/test CSVs are exported from a Hugging Face dataset."""
+
+    train_ratio = 0.6
+    dev_ratio = 0.2
+    test_ratio = 0.2
+
+    def __init__(self, name, dataset_dir, dataset_name, **kwargs):
+        self.name = name
+        self.dataset_dir = dataset_dir
+        self.dataset_name = dataset_name
+        self.target_col = kwargs.get("target_col", "label")
+        self.dataset = load_dataset(dataset_name)  # loaded on every construction, even if raw CSVs already exist
+        super().__init__(self.name, dataset_dir, **kwargs)
+
+    def get_raw_dataset(self):
+        """Return (train_df, test_df); splits are exported to raw/*.csv on first use, test_df may be None."""
+        raw_dir = Path(self.dataset_dir, self.name, "raw")
+        raw_dir.mkdir(parents=True, exist_ok=True)
+        if os.path.exists(Path(raw_dir, "train.csv")):
+            df = pd.read_csv(Path(raw_dir, "train.csv"))
+        else:
+            df = self.dataset["train"].to_pandas()
+            df.to_csv(Path(raw_dir, "train.csv"), index=False)  # no index column, so re-reads match the export
+        test_df = None  # stays None when neither a cached CSV nor a "test" split exists
+        if os.path.exists(Path(raw_dir, "test.csv")):
+            test_df = pd.read_csv(Path(raw_dir, "test.csv"))
+        elif "test" in self.dataset:
+            test_df = self.dataset["test"].to_pandas()
+            test_df.to_csv(Path(raw_dir, "test.csv"), index=False)
+        return df, test_df
+
+
+if __name__ == "__main__":
+    dataset_dir = "D:/work/automl/datasets"  # NOTE(review): machine-specific absolute path
+    save_analysis_pool = True  # when true, process_dataset also awaits solution_designer.generate_solutions
+    datasets_dict = {"datasets": {}}  # process_dataset stores one entry per dataset under "datasets"
+    solution_designer = SolutionDesigner()
+    for dataset_meta in HFDATSETS:
+        hf_dataset = HFExpDataset(
+            dataset_meta["name"], dataset_dir, dataset_meta["dataset_name"], target_col=dataset_meta["target_col"]
+        )
+        asyncio.run(process_dataset(hf_dataset, solution_designer, save_analysis_pool, datasets_dict))
+    save_datasets_dict_to_yaml(datasets_dict, "hf_datasets.yaml")
diff --git a/expo/experimenter/mcts.py b/expo/experimenter/mcts.py
index 2805cae51..9db6e0807 100644
--- a/expo/experimenter/mcts.py
+++ b/expo/experimenter/mcts.py
@@ -22,8 +22,8 @@ class MCTSExperimenter(Experimenter):
 
         text, num_generated_codes = get_tree_text(mcts.root_node)
         text += f"Generated {num_generated_codes} unique codes.\n"
-        text += f"Best node: {best_node}, score: {best_node.raw_reward}\n"
-        text += f"Dev best node: {dev_best_node}, score: {dev_best_node.raw_reward}\n"
+        text += f"Best node: {best_node.id}, score: {best_node.raw_reward}\n"
+        text += f"Dev best node: {dev_best_node.id}, score: {dev_best_node.raw_reward}\n"
         print(text)
         self.save_tree(text)
 
diff --git a/expo/insights/instruction_generator.py b/expo/insights/instruction_generator.py
index 2cfee3107..c9ff7ec6e 100644
--- a/expo/insights/instruction_generator.py
+++ b/expo/insights/instruction_generator.py
@@ -84,7 +84,7 @@ class InstructionGenerator:
         new_instructions = []
         if len(data) == 0:
             mcts_logger.log("MCTS", f"No insights available for task {task_id}")
-            return [original_instruction]  # Return the original instruction if no insights are available
+            # return [original_instruction]  # Return the original instruction if no insights are available
         for i in range(max_num):
             if len(data) == 0:
                 insights = "No insights available"