From 562af8c4e4b2a54275d0501420d4e9b4cd09a612 Mon Sep 17 00:00:00 2001
From: Yizhou Chi <chiyizhou@fuzhi.ai>
Date: Fri, 13 Sep 2024 11:09:02 +0800
Subject: [PATCH 1/5] update text dataset

---
 expo/data/dataset.py |  5 +--
 expo/data/hf_data.py | 61 ++++++++++++++++++++++++++----------
 expo/datasets.yaml   | 73 +++++++++++++++++++++++++++++---------------
 3 files changed, 96 insertions(+), 43 deletions(-)

diff --git a/expo/data/dataset.py b/expo/data/dataset.py
index 1494eb267..3b2017d1a 100644
--- a/expo/data/dataset.py
+++ b/expo/data/dataset.py
@@ -268,8 +268,9 @@ class ExpDataset:
             print(f"Dataset info for {self.name} already exists")
 
     def save_datasetinfo(self, dataset_info):
-        with open(Path(self.dataset_dir, self.name, "dataset_info.json"), "w") as file:
-            json.dump(dataset_info, file, indent=4)
+        with open(Path(self.dataset_dir, self.name, "dataset_info.json"), "w", encoding="utf-8") as file:
+            # Open as UTF-8; ensure_ascii=False writes non-ASCII characters as-is instead of \uXXXX escapes.
+            json.dump(dataset_info, file, indent=4, ensure_ascii=False)
 
     def save_split_datasets(self, df, split, target_col=None):
         path = Path(self.dataset_dir, self.name)
diff --git a/expo/data/hf_data.py b/expo/data/hf_data.py
index 9ed2b2c48..6f1f55f6d 100644
--- a/expo/data/hf_data.py
+++ b/expo/data/hf_data.py
@@ -9,12 +9,22 @@ from expo.data.dataset import ExpDataset, process_dataset, save_datasets_dict_to
 from expo.insights.solution_designer import SolutionDesigner
 
 HFDATSETS = [
-    {"name": "sms_spam", "dataset_name": "ucirvine/sms_spam", "target_col": "label"},
-    {"name": "banking77", "dataset_name": "PolyAI/banking77", "target_col": "label"},
-    {"name": "gnad10", "dataset_name": "community-datasets/gnad10", "target_col": "label"},
-    {"name": "oxford-iiit-pet", "dataset_name": "timm/oxford-iiit-pet", "target_col": "label"},
-    {"name": "stanford_cars", "dataset_name": "tanganke/stanford_cars", "target_col": "label"},
-    {"name": "fashion_mnist", "dataset_name": "zalando-datasets/fashion_mnist", "target_col": "label"},
+    # {"name": "sms_spam", "dataset_name": "ucirvine/sms_spam", "target_col": "label", "modality": "text"},
+    # {"name": "banking77", "dataset_name": "PolyAI/banking77", "target_col": "label", "modality": "text"},
+    # {"name": "gnad10", "dataset_name": "community-datasets/gnad10", "target_col": "label", "modality": "text"},
+    {
+        "name": "oxford-iiit-pet",
+        "dataset_name": "timm/oxford-iiit-pet",
+        "target_col": "label_cat_dog",
+        "modality": "image",
+    },
+    {"name": "stanford_cars", "dataset_name": "tanganke/stanford_cars", "target_col": "label", "modality": "image"},
+    {
+        "name": "fashion_mnist",
+        "dataset_name": "zalando-datasets/fashion_mnist",
+        "target_col": "label",
+        "modality": "image",
+    },
 ]
 
 
@@ -27,41 +37,60 @@ class HFExpDataset(ExpDataset):
         self.name = name
         self.dataset_dir = dataset_dir
         self.dataset_name = dataset_name
+        self.modality = kwargs.get("modality", "")
         self.target_col = kwargs.get("target_col", "label")
-        self.dataset = load_dataset(dataset_name)
+        self.dataset = load_dataset(self.dataset_name, trust_remote_code=True)
         super().__init__(self.name, dataset_dir, **kwargs)
 
     def get_raw_dataset(self):
         raw_dir = Path(self.dataset_dir, self.name, "raw")
         raw_dir.mkdir(parents=True, exist_ok=True)
         if os.path.exists(Path(raw_dir, "train.csv")):
-            df = pd.read_csv(Path(raw_dir, "train.csv"))
+            df = pd.read_csv(Path(raw_dir, "train.csv"), encoding="utf-8")
         else:
             df = self.dataset["train"].to_pandas()
-            df.to_csv(Path(raw_dir, "train.csv"), index=False)
+            df.to_csv(Path(raw_dir, "train.csv"), index=False, encoding="utf-8")
 
         if os.path.exists(Path(raw_dir, "test.csv")):
-            test_df = pd.read_csv(Path(raw_dir, "test.csv"), index=False)
+            test_df = pd.read_csv(Path(raw_dir, "test.csv"), encoding="utf-8")
         else:
-            if "test" in self.dataset:
+            if self.dataset and "test" in self.dataset:
                 test_df = self.dataset["test"].to_pandas()
-                test_df.to_csv(Path(raw_dir, "test.csv"), index=False)
+                test_df.to_csv(Path(raw_dir, "test.csv"), index=False, encoding="utf-8")
             else:
                 test_df = None
         return df, test_df
 
-    # def get_df_head(self, raw_df):
-    #     return raw_df.head()
+    def get_df_head(self, raw_df):
+        if self.modality == "text":
+            examples = []
+            for i in range(5):
+                examples.append(raw_df.iloc[i].to_dict())
+            return examples
+        elif self.modality == "image":
+            return ""
+
+    def get_dataset_info(self):
+        dataset_info = super().get_dataset_info()
+        dataset = self.dataset
+        dataset_info["description"] = dataset["train"].info.description
+        return dataset_info
 
 
 if __name__ == "__main__":
     dataset_dir = "D:/work/automl/datasets"
-    save_analysis_pool = True
+    save_analysis_pool = False
+    force_update = False
     datasets_dict = {"datasets": {}}
     solution_designer = SolutionDesigner()
     for dataset_meta in HFDATSETS:
         hf_dataset = HFExpDataset(
-            dataset_meta["name"], dataset_dir, dataset_meta["dataset_name"], target_col=dataset_meta["target_col"]
+            dataset_meta["name"],
+            dataset_dir,
+            dataset_meta["dataset_name"],
+            target_col=dataset_meta["target_col"],
+            force_update=force_update,
+            modality=dataset_meta["modality"],
         )
         asyncio.run(process_dataset(hf_dataset, solution_designer, save_analysis_pool, datasets_dict))
     save_datasets_dict_to_yaml(datasets_dict, "hf_datasets.yaml")
diff --git a/expo/datasets.yaml b/expo/datasets.yaml
index 512cbc292..051e8232d 100644
--- a/expo/datasets.yaml
+++ b/expo/datasets.yaml
@@ -79,6 +79,14 @@ datasets:
       \ column `percent_pell_grant`.\nPerform data analysis, data preprocessing, feature\
       \ engineering, and modeling to predict the target. \nReport rmse on the eval\
       \ data. Do not plot or make any visualizations.\n"
+  concrete-strength:
+    dataset: concrete-strength
+    metric: rmse
+    target_col: Strength
+    user_requirement: "This is a concrete-strength dataset. Your goal is to predict\
+      \ the target column `Strength`.\nPerform data analysis, data preprocessing,\
+      \ feature engineering, and modeling to predict the target. \nReport rmse on\
+      \ the eval data. Do not plot or make any visualizations.\n"
   credit-g:
     dataset: credit-g
     metric: f1
@@ -135,30 +143,6 @@ datasets:
       \ column `class`.\nPerform data analysis, data preprocessing, feature engineering,\
       \ and modeling to predict the target. \nReport f1 weighted on the eval data.\
       \ Do not plot or make any visualizations.\n"
-  steel-plates-fault:
-    dataset: steel-plates-fault
-    metric: f1 weighted
-    target_col: target
-    user_requirement: "This is a steel-plates-fault dataset. Your goal is to predict\
-      \ the target column `target`.\nPerform data analysis, data preprocessing, feature\
-      \ engineering, and modeling to predict the target. \nReport f1 weighted on the\
-      \ eval data. Do not plot or make any visualizations.\n"
-  wine-quality-white:
-    dataset: wine-quality-white
-    metric: f1 weighted
-    target_col: Class
-    user_requirement: "This is a wine-quality-white dataset. Your goal is to predict\
-      \ the target column `Class`.\nPerform data analysis, data preprocessing, feature\
-      \ engineering, and modeling to predict the target. \nReport f1 weighted on the\
-      \ eval data. Do not plot or make any visualizations.\n"
-  concrete-strength:
-    dataset: concrete-strength
-    metric: rmse
-    target_col: Strength
-    user_requirement: "This is a concrete-strength dataset. Your goal is to predict\
-      \ the target column `Strength`.\nPerform data analysis, data preprocessing,\
-      \ feature engineering, and modeling to predict the target. \nReport rmse on\
-      \ the eval data. Do not plot or make any visualizations.\n"
   smoker-status:
     dataset: smoker-status
     metric: f1
@@ -175,4 +159,43 @@ datasets:
       \ the target column `defects`.\nPerform data analysis, data preprocessing, feature\
       \ engineering, and modeling to predict the target. \nReport f1 on the eval data.\
       \ Do not plot or make any visualizations.\n"
-
+  steel-plates-fault:
+    dataset: steel-plates-fault
+    metric: f1 weighted
+    target_col: target
+    user_requirement: "This is a steel-plates-fault dataset. Your goal is to predict\
+      \ the target column `target`.\nPerform data analysis, data preprocessing, feature\
+      \ engineering, and modeling to predict the target. \nReport f1 weighted on the\
+      \ eval data. Do not plot or make any visualizations.\n"
+  wine-quality-white:
+    dataset: wine-quality-white
+    metric: f1 weighted
+    target_col: Class
+    user_requirement: "This is a wine-quality-white dataset. Your goal is to predict\
+      \ the target column `Class`.\nPerform data analysis, data preprocessing, feature\
+      \ engineering, and modeling to predict the target. \nReport f1 weighted on the\
+      \ eval data. Do not plot or make any visualizations.\n"
+  banking77:
+    dataset: banking77
+    metric: f1 weighted
+    target_col: label
+    user_requirement: "This is a banking77 dataset. Your goal is to predict the target\
+      \ column `label`.\nPerform data analysis, data preprocessing, feature engineering,\
+      \ and modeling to predict the target. \nReport f1 weighted on the eval data.\
+      \ Do not plot or make any visualizations.\n"
+  gnad10:
+    dataset: gnad10
+    metric: f1 weighted
+    target_col: label
+    user_requirement: "This is a gnad10 dataset. Your goal is to predict the target\
+      \ column `label`.\nPerform data analysis, data preprocessing, feature engineering,\
+      \ and modeling to predict the target. \nReport f1 weighted on the eval data.\
+      \ Do not plot or make any visualizations.\n"
+  sms_spam:
+    dataset: sms_spam
+    metric: f1
+    target_col: label
+    user_requirement: "This is a sms_spam dataset. Your goal is to predict the target\
+      \ column `label`.\nPerform data analysis, data preprocessing, feature engineering,\
+      \ and modeling to predict the target. \nReport f1 on the eval data. Do not plot\
+      \ or make any visualizations.\n"

From 9f0427838324f90e4bc9ed62c6eba9e6a2ad4465 Mon Sep 17 00:00:00 2001
From: Yizhou Chi <chiyizhou@fuzhi.ai>
Date: Fri, 13 Sep 2024 11:18:15 +0800
Subject: [PATCH 2/5] enable text datasets and comment out image datasets in HFDATSETS

---
 expo/data/hf_data.py | 35 +++++++++++++++++++----------------
 1 file changed, 19 insertions(+), 16 deletions(-)

diff --git a/expo/data/hf_data.py b/expo/data/hf_data.py
index 6f1f55f6d..952ab5c73 100644
--- a/expo/data/hf_data.py
+++ b/expo/data/hf_data.py
@@ -9,22 +9,25 @@ from expo.data.dataset import ExpDataset, process_dataset, save_datasets_dict_to
 from expo.insights.solution_designer import SolutionDesigner
 
 HFDATSETS = [
-    # {"name": "sms_spam", "dataset_name": "ucirvine/sms_spam", "target_col": "label", "modality": "text"},
-    # {"name": "banking77", "dataset_name": "PolyAI/banking77", "target_col": "label", "modality": "text"},
-    # {"name": "gnad10", "dataset_name": "community-datasets/gnad10", "target_col": "label", "modality": "text"},
-    {
-        "name": "oxford-iiit-pet",
-        "dataset_name": "timm/oxford-iiit-pet",
-        "target_col": "label_cat_dog",
-        "modality": "image",
-    },
-    {"name": "stanford_cars", "dataset_name": "tanganke/stanford_cars", "target_col": "label", "modality": "image"},
-    {
-        "name": "fashion_mnist",
-        "dataset_name": "zalando-datasets/fashion_mnist",
-        "target_col": "label",
-        "modality": "image",
-    },
+    {"name": "sms_spam", "dataset_name": "ucirvine/sms_spam", "target_col": "label", "modality": "text"},
+    {"name": "banking77", "dataset_name": "PolyAI/banking77", "target_col": "label", "modality": "text"},
+    {"name": "gnad10", "dataset_name": "community-datasets/gnad10", "target_col": "label", "modality": "text"},
+    # {
+    #     "name": "oxford-iiit-pet",
+    #     "dataset_name": "timm/oxford-iiit-pet",
+    #     "target_col": "label_cat_dog",
+    #     "modality": "image",
+    # },
+    # {   "name": "stanford_cars",
+    #     "dataset_name": "tanganke/stanford_cars",
+    #     "target_col": "label",
+    #     "modality": "image"},
+    # {
+    #     "name": "fashion_mnist",
+    #     "dataset_name": "zalando-datasets/fashion_mnist",
+    #     "target_col": "label",
+    #     "modality": "image",
+    # },
 ]
 
 

From a6b066a127f7d2fcfa8e6b04dc4bfaed5b20c50a Mon Sep 17 00:00:00 2001
From: limafang <azula_fire@163.com>
Date: Fri, 13 Sep 2024 17:52:22 +0800
Subject: [PATCH 3/5] support image dataset

---
 expo/data/hf_data.py | 72 ++++++++++++++++++++++++++++++++------------
 1 file changed, 53 insertions(+), 19 deletions(-)

diff --git a/expo/data/hf_data.py b/expo/data/hf_data.py
index 952ab5c73..45ff6330b 100644
--- a/expo/data/hf_data.py
+++ b/expo/data/hf_data.py
@@ -1,7 +1,9 @@
 import asyncio
 import os
 from pathlib import Path
-
+import numpy as np
+from PIL import Image
+import io
 import pandas as pd
 from datasets import load_dataset
 
@@ -9,22 +11,25 @@ from expo.data.dataset import ExpDataset, process_dataset, save_datasets_dict_to
 from expo.insights.solution_designer import SolutionDesigner
 
 HFDATSETS = [
-    {"name": "sms_spam", "dataset_name": "ucirvine/sms_spam", "target_col": "label", "modality": "text"},
-    {"name": "banking77", "dataset_name": "PolyAI/banking77", "target_col": "label", "modality": "text"},
-    {"name": "gnad10", "dataset_name": "community-datasets/gnad10", "target_col": "label", "modality": "text"},
-    # {
-    #     "name": "oxford-iiit-pet",
-    #     "dataset_name": "timm/oxford-iiit-pet",
-    #     "target_col": "label_cat_dog",
-    #     "modality": "image",
-    # },
+    # {"name": "sms_spam", "dataset_name": "ucirvine/sms_spam", "target_col": "label", "modality": "text"},
+    # {"name": "banking77", "dataset_name": "PolyAI/banking77", "target_col": "label", "modality": "text"},
+    # {"name": "gnad10", "dataset_name": "community-datasets/gnad10", "target_col": "label", "modality": "text"},
+    {
+        "name": "oxford-iiit-pet",
+        "dataset_name": "timm/oxford-iiit-pet",
+        "image_col": "image",
+        "target_col": "label_cat_dog",
+        "modality": "image",
+    },
     # {   "name": "stanford_cars",
     #     "dataset_name": "tanganke/stanford_cars",
+    #     "image_col": "image",
     #     "target_col": "label",
     #     "modality": "image"},
     # {
     #     "name": "fashion_mnist",
     #     "dataset_name": "zalando-datasets/fashion_mnist",
+    #     "image_col": "image",
     #     "target_col": "label",
     #     "modality": "image",
     # },
@@ -42,16 +47,22 @@ class HFExpDataset(ExpDataset):
         self.dataset_name = dataset_name
         self.modality = kwargs.get("modality", "")
         self.target_col = kwargs.get("target_col", "label")
+        self.image_col = kwargs.get("image_col", "image")
         self.dataset = load_dataset(self.dataset_name, trust_remote_code=True)
         super().__init__(self.name, dataset_dir, **kwargs)
 
     def get_raw_dataset(self):
         raw_dir = Path(self.dataset_dir, self.name, "raw")
         raw_dir.mkdir(parents=True, exist_ok=True)
+
         if os.path.exists(Path(raw_dir, "train.csv")):
             df = pd.read_csv(Path(raw_dir, "train.csv"), encoding="utf-8")
         else:
             df = self.dataset["train"].to_pandas()
+
+            if self.modality == "image":
+                df = self.save_images_and_update_df(df, raw_dir, "train")
+
             df.to_csv(Path(raw_dir, "train.csv"), index=False, encoding="utf-8")
 
         if os.path.exists(Path(raw_dir, "test.csv")):
@@ -59,19 +70,37 @@ class HFExpDataset(ExpDataset):
         else:
             if self.dataset and "test" in self.dataset:
                 test_df = self.dataset["test"].to_pandas()
+
+                if self.modality == "image":
+                    test_df = self.save_images_and_update_df(test_df, raw_dir, "test")
+
                 test_df.to_csv(Path(raw_dir, "test.csv"), index=False, encoding="utf-8")
             else:
                 test_df = None
+
         return df, test_df
 
+    def save_images_and_update_df(self, df, raw_dir, split):
+        image_dir = Path(raw_dir, f"{split}_images")
+        image_dir.mkdir(parents=True, exist_ok=True)
+
+        def process_image(idx, row):
+            image_bytes = row[self.image_col]["bytes"]
+            image = Image.open(io.BytesIO(image_bytes))
+            if image.mode == "RGBA":
+                image = image.convert("RGB")
+            img_path = Path(image_dir, f"{idx}.jpg")
+            image.save(img_path)
+            return str(img_path)
+
+        df["image"] = df.apply(lambda row: process_image(row.name, row), axis=1)
+        return df
+
     def get_df_head(self, raw_df):
-        if self.modality == "text":
-            examples = []
-            for i in range(5):
-                examples.append(raw_df.iloc[i].to_dict())
-            return examples
-        elif self.modality == "image":
-            return ""
+        examples = []
+        for i in range(5):
+            examples.append(raw_df.iloc[i].to_dict())
+        return examples
 
     def get_dataset_info(self):
         dataset_info = super().get_dataset_info()
@@ -82,7 +111,7 @@ class HFExpDataset(ExpDataset):
 
 if __name__ == "__main__":
     dataset_dir = "D:/work/automl/datasets"
-    save_analysis_pool = False
+    save_analysis_pool = True
     force_update = False
     datasets_dict = {"datasets": {}}
     solution_designer = SolutionDesigner()
@@ -92,8 +121,13 @@ if __name__ == "__main__":
             dataset_dir,
             dataset_meta["dataset_name"],
             target_col=dataset_meta["target_col"],
+            image_col=dataset_meta["image_col"],
             force_update=force_update,
             modality=dataset_meta["modality"],
         )
-        asyncio.run(process_dataset(hf_dataset, solution_designer, save_analysis_pool, datasets_dict))
+        asyncio.run(
+            process_dataset(
+                hf_dataset, solution_designer, save_analysis_pool, datasets_dict
+            )
+        )
     save_datasets_dict_to_yaml(datasets_dict, "hf_datasets.yaml")

From cfa21ba27e6748229ff61636549c444e25b21904 Mon Sep 17 00:00:00 2001
From: Yizhou Chi <chiyizhou@fuzhi.ai>
Date: Fri, 13 Sep 2024 19:04:48 +0800
Subject: [PATCH 4/5] change img path from abs to rel

---
 expo/data/hf_data.py | 56 ++++++++++++++++++++++----------------------
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/expo/data/hf_data.py b/expo/data/hf_data.py
index 45ff6330b..6f615c8cb 100644
--- a/expo/data/hf_data.py
+++ b/expo/data/hf_data.py
@@ -1,19 +1,19 @@
 import asyncio
+import io
 import os
 from pathlib import Path
-import numpy as np
-from PIL import Image
-import io
+
 import pandas as pd
 from datasets import load_dataset
+from PIL import Image
 
 from expo.data.dataset import ExpDataset, process_dataset, save_datasets_dict_to_yaml
 from expo.insights.solution_designer import SolutionDesigner
 
 HFDATSETS = [
-    # {"name": "sms_spam", "dataset_name": "ucirvine/sms_spam", "target_col": "label", "modality": "text"},
-    # {"name": "banking77", "dataset_name": "PolyAI/banking77", "target_col": "label", "modality": "text"},
-    # {"name": "gnad10", "dataset_name": "community-datasets/gnad10", "target_col": "label", "modality": "text"},
+    {"name": "sms_spam", "dataset_name": "ucirvine/sms_spam", "target_col": "label", "modality": "text"},
+    {"name": "banking77", "dataset_name": "PolyAI/banking77", "target_col": "label", "modality": "text"},
+    {"name": "gnad10", "dataset_name": "community-datasets/gnad10", "target_col": "label", "modality": "text"},
     {
         "name": "oxford-iiit-pet",
         "dataset_name": "timm/oxford-iiit-pet",
@@ -21,18 +21,20 @@ HFDATSETS = [
         "target_col": "label_cat_dog",
         "modality": "image",
     },
-    # {   "name": "stanford_cars",
-    #     "dataset_name": "tanganke/stanford_cars",
-    #     "image_col": "image",
-    #     "target_col": "label",
-    #     "modality": "image"},
-    # {
-    #     "name": "fashion_mnist",
-    #     "dataset_name": "zalando-datasets/fashion_mnist",
-    #     "image_col": "image",
-    #     "target_col": "label",
-    #     "modality": "image",
-    # },
+    {
+        "name": "stanford_cars",
+        "dataset_name": "tanganke/stanford_cars",
+        "image_col": "image",
+        "target_col": "label",
+        "modality": "image",
+    },
+    {
+        "name": "fashion_mnist",
+        "dataset_name": "zalando-datasets/fashion_mnist",
+        "image_col": "image",
+        "target_col": "label",
+        "modality": "image",
+    },
 ]
 
 
@@ -81,17 +83,19 @@ class HFExpDataset(ExpDataset):
         return df, test_df
 
     def save_images_and_update_df(self, df, raw_dir, split):
-        image_dir = Path(raw_dir, f"{split}_images")
-        image_dir.mkdir(parents=True, exist_ok=True)
+        abs_image_dir = Path(raw_dir, f"{split}_images")
+        rel_image_dir = f"raw/{split}_images"
+        abs_image_dir.mkdir(parents=True, exist_ok=True)
 
         def process_image(idx, row):
             image_bytes = row[self.image_col]["bytes"]
             image = Image.open(io.BytesIO(image_bytes))
             if image.mode == "RGBA":
                 image = image.convert("RGB")
-            img_path = Path(image_dir, f"{idx}.jpg")
+            img_path = Path(abs_image_dir, f"{idx}.jpg")
+            rel_img_path = f"{rel_image_dir}/{idx}.jpg"
             image.save(img_path)
-            return str(img_path)
+            return rel_img_path
 
         df["image"] = df.apply(lambda row: process_image(row.name, row), axis=1)
         return df
@@ -112,7 +116,7 @@ class HFExpDataset(ExpDataset):
 if __name__ == "__main__":
     dataset_dir = "D:/work/automl/datasets"
     save_analysis_pool = True
-    force_update = False
+    force_update = True
     datasets_dict = {"datasets": {}}
     solution_designer = SolutionDesigner()
     for dataset_meta in HFDATSETS:
@@ -125,9 +129,5 @@ if __name__ == "__main__":
             force_update=force_update,
             modality=dataset_meta["modality"],
         )
-        asyncio.run(
-            process_dataset(
-                hf_dataset, solution_designer, save_analysis_pool, datasets_dict
-            )
-        )
+        asyncio.run(process_dataset(hf_dataset, solution_designer, save_analysis_pool, datasets_dict))
     save_datasets_dict_to_yaml(datasets_dict, "hf_datasets.yaml")

From 7d8cb9afec8017fa1006df377028f9648c40d3b5 Mon Sep 17 00:00:00 2001
From: Yizhou Chi <chiyizhou@fuzhi.ai>
Date: Fri, 13 Sep 2024 19:10:45 +0800
Subject: [PATCH 5/5] add image datasets config

---
 expo/data/hf_data.py |  6 +++---
 expo/datasets.yaml   | 24 ++++++++++++++++++++++++
 2 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/expo/data/hf_data.py b/expo/data/hf_data.py
index 6f615c8cb..df3a6ed20 100644
--- a/expo/data/hf_data.py
+++ b/expo/data/hf_data.py
@@ -115,8 +115,8 @@ class HFExpDataset(ExpDataset):
 
 if __name__ == "__main__":
     dataset_dir = "D:/work/automl/datasets"
-    save_analysis_pool = True
-    force_update = True
+    save_analysis_pool = False
+    force_update = False
     datasets_dict = {"datasets": {}}
     solution_designer = SolutionDesigner()
     for dataset_meta in HFDATSETS:
@@ -125,7 +125,7 @@ if __name__ == "__main__":
             dataset_dir,
             dataset_meta["dataset_name"],
             target_col=dataset_meta["target_col"],
-            image_col=dataset_meta["image_col"],
+            image_col=dataset_meta.get("image_col", ""),
             force_update=force_update,
             modality=dataset_meta["modality"],
         )
diff --git a/expo/datasets.yaml b/expo/datasets.yaml
index 051e8232d..92e004c6d 100644
--- a/expo/datasets.yaml
+++ b/expo/datasets.yaml
@@ -183,6 +183,14 @@ datasets:
       \ column `label`.\nPerform data analysis, data preprocessing, feature engineering,\
       \ and modeling to predict the target. \nReport f1 weighted on the eval data.\
       \ Do not plot or make any visualizations.\n"
+  fashion_mnist:
+    dataset: fashion_mnist
+    metric: f1 weighted
+    target_col: label
+    user_requirement: "This is a fashion_mnist dataset. Your goal is to predict the\
+      \ target column `label`.\nPerform data analysis, data preprocessing, feature\
+      \ engineering, and modeling to predict the target. \nReport f1 weighted on the\
+      \ eval data. Do not plot or make any visualizations.\n"
   gnad10:
     dataset: gnad10
     metric: f1 weighted
@@ -191,6 +199,14 @@ datasets:
       \ column `label`.\nPerform data analysis, data preprocessing, feature engineering,\
       \ and modeling to predict the target. \nReport f1 weighted on the eval data.\
       \ Do not plot or make any visualizations.\n"
+  oxford-iiit-pet:
+    dataset: oxford-iiit-pet
+    metric: f1
+    target_col: label_cat_dog
+    user_requirement: "This is a oxford-iiit-pet dataset. Your goal is to predict\
+      \ the target column `label_cat_dog`.\nPerform data analysis, data preprocessing,\
+      \ feature engineering, and modeling to predict the target. \nReport f1 on the\
+      \ eval data. Do not plot or make any visualizations.\n"
   sms_spam:
     dataset: sms_spam
     metric: f1
@@ -199,3 +215,11 @@ datasets:
       \ column `label`.\nPerform data analysis, data preprocessing, feature engineering,\
       \ and modeling to predict the target. \nReport f1 on the eval data. Do not plot\
       \ or make any visualizations.\n"
+  stanford_cars:
+    dataset: stanford_cars
+    metric: f1 weighted
+    target_col: label
+    user_requirement: "This is a stanford_cars dataset. Your goal is to predict the\
+      \ target column `label`.\nPerform data analysis, data preprocessing, feature\
+      \ engineering, and modeling to predict the target. \nReport f1 weighted on the\
+      \ eval data. Do not plot or make any visualizations.\n"