update text dataset

2026-06-11 15:15:18 +02:00 · 2024-09-13 11:09:02 +08:00 · 2024-09-13 11:09:02 +08:00 · 562af8c4e4
commit 562af8c4e4
parent eaf1b62343
3 changed files with 96 additions and 43 deletions
--- a/expo/data/dataset.py
+++ b/expo/data/dataset.py
@@ -268,8 +268,9 @@ class ExpDataset:
            print(f"Dataset info for {self.name} already exists")

    def save_datasetinfo(self, dataset_info):
-        with open(Path(self.dataset_dir, self.name, "dataset_info.json"), "w") as file:
-            json.dump(dataset_info, file, indent=4)
+        with open(Path(self.dataset_dir, self.name, "dataset_info.json"), "w", encoding="utf-8") as file:
+            # utf-8 encoding is required
+            json.dump(dataset_info, file, indent=4, ensure_ascii=False)

    def save_split_datasets(self, df, split, target_col=None):
        path = Path(self.dataset_dir, self.name)
--- a/expo/data/hf_data.py
+++ b/expo/data/hf_data.py
@@ -9,12 +9,22 @@ from expo.data.dataset import ExpDataset, process_dataset, save_datasets_dict_to
 from expo.insights.solution_designer import SolutionDesigner

 HFDATSETS = [
-    {"name": "sms_spam", "dataset_name": "ucirvine/sms_spam", "target_col": "label"},
-    {"name": "banking77", "dataset_name": "PolyAI/banking77", "target_col": "label"},
-    {"name": "gnad10", "dataset_name": "community-datasets/gnad10", "target_col": "label"},
-    {"name": "oxford-iiit-pet", "dataset_name": "timm/oxford-iiit-pet", "target_col": "label"},
-    {"name": "stanford_cars", "dataset_name": "tanganke/stanford_cars", "target_col": "label"},
-    {"name": "fashion_mnist", "dataset_name": "zalando-datasets/fashion_mnist", "target_col": "label"},
+    # {"name": "sms_spam", "dataset_name": "ucirvine/sms_spam", "target_col": "label", "modality": "text"},
+    # {"name": "banking77", "dataset_name": "PolyAI/banking77", "target_col": "label", "modality": "text"},
+    # {"name": "gnad10", "dataset_name": "community-datasets/gnad10", "target_col": "label", "modality": "text"},
+    {
+        "name": "oxford-iiit-pet",
+        "dataset_name": "timm/oxford-iiit-pet",
+        "target_col": "label_cat_dog",
+        "modality": "image",
+    },
+    {"name": "stanford_cars", "dataset_name": "tanganke/stanford_cars", "target_col": "label", "modality": "image"},
+    {
+        "name": "fashion_mnist",
+        "dataset_name": "zalando-datasets/fashion_mnist",
+        "target_col": "label",
+        "modality": "image",
+    },
 ]


@@ -27,41 +37,60 @@ class HFExpDataset(ExpDataset):
        self.name = name
        self.dataset_dir = dataset_dir
        self.dataset_name = dataset_name
+        self.modality = kwargs.get("modality", "")
        self.target_col = kwargs.get("target_col", "label")
-        self.dataset = load_dataset(dataset_name)
+        self.dataset = load_dataset(self.dataset_name, trust_remote_code=True)
        super().__init__(self.name, dataset_dir, **kwargs)

    def get_raw_dataset(self):
        raw_dir = Path(self.dataset_dir, self.name, "raw")
        raw_dir.mkdir(parents=True, exist_ok=True)
        if os.path.exists(Path(raw_dir, "train.csv")):
-            df = pd.read_csv(Path(raw_dir, "train.csv"))
+            df = pd.read_csv(Path(raw_dir, "train.csv"), encoding="utf-8")
        else:
            df = self.dataset["train"].to_pandas()
-            df.to_csv(Path(raw_dir, "train.csv"), index=False)
+            df.to_csv(Path(raw_dir, "train.csv"), index=False, encoding="utf-8")

        if os.path.exists(Path(raw_dir, "test.csv")):
-            test_df = pd.read_csv(Path(raw_dir, "test.csv"), index=False)
+            test_df = pd.read_csv(Path(raw_dir, "test.csv"), encoding="utf-8")
        else:
-            if "test" in self.dataset:
+            if self.dataset and "test" in self.dataset:
                test_df = self.dataset["test"].to_pandas()
-                test_df.to_csv(Path(raw_dir, "test.csv"), index=False)
+                test_df.to_csv(Path(raw_dir, "test.csv"), index=False, encoding="utf-8")
            else:
                test_df = None
        return df, test_df

-    # def get_df_head(self, raw_df):
-    #     return raw_df.head()
+    def get_df_head(self, raw_df):
+        if self.modality == "text":
+            examples = []
+            for i in range(5):
+                examples.append(raw_df.iloc[i].to_dict())
+            return examples
+        elif self.modality == "image":
+            return ""
+
+    def get_dataset_info(self):
+        dataset_info = super().get_dataset_info()
+        dataset = self.dataset
+        dataset_info["description"] = dataset["train"].info.description
+        return dataset_info


 if __name__ == "__main__":
    dataset_dir = "D:/work/automl/datasets"
-    save_analysis_pool = True
+    save_analysis_pool = False
+    force_update = False
    datasets_dict = {"datasets": {}}
    solution_designer = SolutionDesigner()
    for dataset_meta in HFDATSETS:
        hf_dataset = HFExpDataset(
-            dataset_meta["name"], dataset_dir, dataset_meta["dataset_name"], target_col=dataset_meta["target_col"]
+            dataset_meta["name"],
+            dataset_dir,
+            dataset_meta["dataset_name"],
+            target_col=dataset_meta["target_col"],
+            force_update=force_update,
+            modality=dataset_meta["modality"],
        )
        asyncio.run(process_dataset(hf_dataset, solution_designer, save_analysis_pool, datasets_dict))
    save_datasets_dict_to_yaml(datasets_dict, "hf_datasets.yaml")
--- a/expo/datasets.yaml
+++ b/expo/datasets.yaml
@@ -79,6 +79,14 @@ datasets:
      \ column `percent_pell_grant`.\nPerform data analysis, data preprocessing, feature\
      \ engineering, and modeling to predict the target. \nReport rmse on the eval\
      \ data. Do not plot or make any visualizations.\n"
+  concrete-strength:
+    dataset: concrete-strength
+    metric: rmse
+    target_col: Strength
+    user_requirement: "This is a concrete-strength dataset. Your goal is to predict\
+      \ the target column `Strength`.\nPerform data analysis, data preprocessing,\
+      \ feature engineering, and modeling to predict the target. \nReport rmse on\
+      \ the eval data. Do not plot or make any visualizations.\n"
  credit-g:
    dataset: credit-g
    metric: f1
@@ -135,30 +143,6 @@ datasets:
      \ column `class`.\nPerform data analysis, data preprocessing, feature engineering,\
      \ and modeling to predict the target. \nReport f1 weighted on the eval data.\
      \ Do not plot or make any visualizations.\n"
-  steel-plates-fault:
-    dataset: steel-plates-fault
-    metric: f1 weighted
-    target_col: target
-    user_requirement: "This is a steel-plates-fault dataset. Your goal is to predict\
-      \ the target column `target`.\nPerform data analysis, data preprocessing, feature\
-      \ engineering, and modeling to predict the target. \nReport f1 weighted on the\
-      \ eval data. Do not plot or make any visualizations.\n"
-  wine-quality-white:
-    dataset: wine-quality-white
-    metric: f1 weighted
-    target_col: Class
-    user_requirement: "This is a wine-quality-white dataset. Your goal is to predict\
-      \ the target column `Class`.\nPerform data analysis, data preprocessing, feature\
-      \ engineering, and modeling to predict the target. \nReport f1 weighted on the\
-      \ eval data. Do not plot or make any visualizations.\n"
-  concrete-strength:
-    dataset: concrete-strength
-    metric: rmse
-    target_col: Strength
-    user_requirement: "This is a concrete-strength dataset. Your goal is to predict\
-      \ the target column `Strength`.\nPerform data analysis, data preprocessing,\
-      \ feature engineering, and modeling to predict the target. \nReport rmse on\
-      \ the eval data. Do not plot or make any visualizations.\n"
  smoker-status:
    dataset: smoker-status
    metric: f1
@@ -175,4 +159,43 @@ datasets:
      \ the target column `defects`.\nPerform data analysis, data preprocessing, feature\
      \ engineering, and modeling to predict the target. \nReport f1 on the eval data.\
      \ Do not plot or make any visualizations.\n"
-
+  steel-plates-fault:
+    dataset: steel-plates-fault
+    metric: f1 weighted
+    target_col: target
+    user_requirement: "This is a steel-plates-fault dataset. Your goal is to predict\
+      \ the target column `target`.\nPerform data analysis, data preprocessing, feature\
+      \ engineering, and modeling to predict the target. \nReport f1 weighted on the\
+      \ eval data. Do not plot or make any visualizations.\n"
+  wine-quality-white:
+    dataset: wine-quality-white
+    metric: f1 weighted
+    target_col: Class
+    user_requirement: "This is a wine-quality-white dataset. Your goal is to predict\
+      \ the target column `Class`.\nPerform data analysis, data preprocessing, feature\
+      \ engineering, and modeling to predict the target. \nReport f1 weighted on the\
+      \ eval data. Do not plot or make any visualizations.\n"
+  banking77:
+    dataset: banking77
+    metric: f1 weighted
+    target_col: label
+    user_requirement: "This is a banking77 dataset. Your goal is to predict the target\
+      \ column `label`.\nPerform data analysis, data preprocessing, feature engineering,\
+      \ and modeling to predict the target. \nReport f1 weighted on the eval data.\
+      \ Do not plot or make any visualizations.\n"
+  gnad10:
+    dataset: gnad10
+    metric: f1 weighted
+    target_col: label
+    user_requirement: "This is a gnad10 dataset. Your goal is to predict the target\
+      \ column `label`.\nPerform data analysis, data preprocessing, feature engineering,\
+      \ and modeling to predict the target. \nReport f1 weighted on the eval data.\
+      \ Do not plot or make any visualizations.\n"
+  sms_spam:
+    dataset: sms_spam
+    metric: f1
+    target_col: label
+    user_requirement: "This is a sms_spam dataset. Your goal is to predict the target\
+      \ column `label`.\nPerform data analysis, data preprocessing, feature engineering,\
+      \ and modeling to predict the target. \nReport f1 on the eval data. Do not plot\
+      \ or make any visualizations.\n"