From 562af8c4e4b2a54275d0501420d4e9b4cd09a612 Mon Sep 17 00:00:00 2001 From: Yizhou Chi Date: Fri, 13 Sep 2024 11:09:02 +0800 Subject: [PATCH 1/5] update text dataset --- expo/data/dataset.py | 5 +-- expo/data/hf_data.py | 61 ++++++++++++++++++++++++++---------- expo/datasets.yaml | 73 +++++++++++++++++++++++++++++--------------- 3 files changed, 96 insertions(+), 43 deletions(-) diff --git a/expo/data/dataset.py b/expo/data/dataset.py index 1494eb267..3b2017d1a 100644 --- a/expo/data/dataset.py +++ b/expo/data/dataset.py @@ -268,8 +268,9 @@ class ExpDataset: print(f"Dataset info for {self.name} already exists") def save_datasetinfo(self, dataset_info): - with open(Path(self.dataset_dir, self.name, "dataset_info.json"), "w") as file: - json.dump(dataset_info, file, indent=4) + with open(Path(self.dataset_dir, self.name, "dataset_info.json"), "w", encoding="utf-8") as file: + # utf-8 encoding is required + json.dump(dataset_info, file, indent=4, ensure_ascii=False) def save_split_datasets(self, df, split, target_col=None): path = Path(self.dataset_dir, self.name) diff --git a/expo/data/hf_data.py b/expo/data/hf_data.py index 9ed2b2c48..6f1f55f6d 100644 --- a/expo/data/hf_data.py +++ b/expo/data/hf_data.py @@ -9,12 +9,22 @@ from expo.data.dataset import ExpDataset, process_dataset, save_datasets_dict_to from expo.insights.solution_designer import SolutionDesigner HFDATSETS = [ - {"name": "sms_spam", "dataset_name": "ucirvine/sms_spam", "target_col": "label"}, - {"name": "banking77", "dataset_name": "PolyAI/banking77", "target_col": "label"}, - {"name": "gnad10", "dataset_name": "community-datasets/gnad10", "target_col": "label"}, - {"name": "oxford-iiit-pet", "dataset_name": "timm/oxford-iiit-pet", "target_col": "label"}, - {"name": "stanford_cars", "dataset_name": "tanganke/stanford_cars", "target_col": "label"}, - {"name": "fashion_mnist", "dataset_name": "zalando-datasets/fashion_mnist", "target_col": "label"}, + # {"name": "sms_spam", "dataset_name": "ucirvine/sms_spam", "target_col": "label", "modality": "text"}, + # {"name": "banking77", "dataset_name": "PolyAI/banking77", "target_col": "label", "modality": "text"}, + # {"name": "gnad10", "dataset_name": "community-datasets/gnad10", "target_col": "label", "modality": "text"}, + { + "name": "oxford-iiit-pet", + "dataset_name": "timm/oxford-iiit-pet", + "target_col": "label_cat_dog", + "modality": "image", + }, + {"name": "stanford_cars", "dataset_name": "tanganke/stanford_cars", "target_col": "label", "modality": "image"}, + { + "name": "fashion_mnist", + "dataset_name": "zalando-datasets/fashion_mnist", + "target_col": "label", + "modality": "image", + }, ] @@ -27,41 +37,60 @@ class HFExpDataset(ExpDataset): self.name = name self.dataset_dir = dataset_dir self.dataset_name = dataset_name + self.modality = kwargs.get("modality", "") self.target_col = kwargs.get("target_col", "label") - self.dataset = load_dataset(dataset_name) + self.dataset = load_dataset(self.dataset_name, trust_remote_code=True) super().__init__(self.name, dataset_dir, **kwargs) def get_raw_dataset(self): raw_dir = Path(self.dataset_dir, self.name, "raw") raw_dir.mkdir(parents=True, exist_ok=True) if os.path.exists(Path(raw_dir, "train.csv")): - df = pd.read_csv(Path(raw_dir, "train.csv")) + df = pd.read_csv(Path(raw_dir, "train.csv"), encoding="utf-8") else: df = self.dataset["train"].to_pandas() - df.to_csv(Path(raw_dir, "train.csv"), index=False) + df.to_csv(Path(raw_dir, "train.csv"), index=False, encoding="utf-8") if os.path.exists(Path(raw_dir, "test.csv")): - test_df = pd.read_csv(Path(raw_dir, "test.csv"), index=False) + test_df = pd.read_csv(Path(raw_dir, "test.csv"), encoding="utf-8") else: - if "test" in self.dataset: + if self.dataset and "test" in self.dataset: test_df = self.dataset["test"].to_pandas() - test_df.to_csv(Path(raw_dir, "test.csv"), index=False) + test_df.to_csv(Path(raw_dir, "test.csv"), index=False, encoding="utf-8") else: test_df = None return df, test_df - # def get_df_head(self, raw_df): - # return raw_df.head() + def get_df_head(self, raw_df): + if self.modality == "text": + examples = [] + for i in range(5): + examples.append(raw_df.iloc[i].to_dict()) + return examples + elif self.modality == "image": + return "" + + def get_dataset_info(self): + dataset_info = super().get_dataset_info() + dataset = self.dataset + dataset_info["description"] = dataset["train"].info.description + return dataset_info if __name__ == "__main__": dataset_dir = "D:/work/automl/datasets" - save_analysis_pool = True + save_analysis_pool = False + force_update = False datasets_dict = {"datasets": {}} solution_designer = SolutionDesigner() for dataset_meta in HFDATSETS: hf_dataset = HFExpDataset( - dataset_meta["name"], dataset_dir, dataset_meta["dataset_name"], target_col=dataset_meta["target_col"] + dataset_meta["name"], + dataset_dir, + dataset_meta["dataset_name"], + target_col=dataset_meta["target_col"], + force_update=force_update, + modality=dataset_meta["modality"], ) asyncio.run(process_dataset(hf_dataset, solution_designer, save_analysis_pool, datasets_dict)) save_datasets_dict_to_yaml(datasets_dict, "hf_datasets.yaml") diff --git a/expo/datasets.yaml b/expo/datasets.yaml index 512cbc292..051e8232d 100644 --- a/expo/datasets.yaml +++ b/expo/datasets.yaml @@ -79,6 +79,14 @@ datasets: \ column `percent_pell_grant`.\nPerform data analysis, data preprocessing, feature\ \ engineering, and modeling to predict the target. \nReport rmse on the eval\ \ data. Do not plot or make any visualizations.\n" + concrete-strength: + dataset: concrete-strength + metric: rmse + target_col: Strength + user_requirement: "This is a concrete-strength dataset. Your goal is to predict\ + \ the target column `Strength`.\nPerform data analysis, data preprocessing,\ + \ feature engineering, and modeling to predict the target. \nReport rmse on\ + \ the eval data. Do not plot or make any visualizations.\n" credit-g: dataset: credit-g metric: f1 @@ -135,30 +143,6 @@ datasets: \ column `class`.\nPerform data analysis, data preprocessing, feature engineering,\ \ and modeling to predict the target. \nReport f1 weighted on the eval data.\ \ Do not plot or make any visualizations.\n" - steel-plates-fault: - dataset: steel-plates-fault - metric: f1 weighted - target_col: target - user_requirement: "This is a steel-plates-fault dataset. Your goal is to predict\ - \ the target column `target`.\nPerform data analysis, data preprocessing, feature\ - \ engineering, and modeling to predict the target. \nReport f1 weighted on the\ - \ eval data. Do not plot or make any visualizations.\n" - wine-quality-white: - dataset: wine-quality-white - metric: f1 weighted - target_col: Class - user_requirement: "This is a wine-quality-white dataset. Your goal is to predict\ - \ the target column `Class`.\nPerform data analysis, data preprocessing, feature\ - \ engineering, and modeling to predict the target. \nReport f1 weighted on the\ - \ eval data. Do not plot or make any visualizations.\n" - concrete-strength: - dataset: concrete-strength - metric: rmse - target_col: Strength - user_requirement: "This is a concrete-strength dataset. Your goal is to predict\ - \ the target column `Strength`.\nPerform data analysis, data preprocessing,\ - \ feature engineering, and modeling to predict the target. \nReport rmse on\ - \ the eval data. Do not plot or make any visualizations.\n" smoker-status: dataset: smoker-status metric: f1 @@ -175,4 +159,43 @@ datasets: \ the target column `defects`.\nPerform data analysis, data preprocessing, feature\ \ engineering, and modeling to predict the target. \nReport f1 on the eval data.\ \ Do not plot or make any visualizations.\n" - + steel-plates-fault: + dataset: steel-plates-fault + metric: f1 weighted + target_col: target + user_requirement: "This is a steel-plates-fault dataset. Your goal is to predict\ + \ the target column `target`.\nPerform data analysis, data preprocessing, feature\ + \ engineering, and modeling to predict the target. \nReport f1 weighted on the\ + \ eval data. Do not plot or make any visualizations.\n" + wine-quality-white: + dataset: wine-quality-white + metric: f1 weighted + target_col: Class + user_requirement: "This is a wine-quality-white dataset. Your goal is to predict\ + \ the target column `Class`.\nPerform data analysis, data preprocessing, feature\ + \ engineering, and modeling to predict the target. \nReport f1 weighted on the\ + \ eval data. Do not plot or make any visualizations.\n" + banking77: + dataset: banking77 + metric: f1 weighted + target_col: label + user_requirement: "This is a banking77 dataset. Your goal is to predict the target\ + \ column `label`.\nPerform data analysis, data preprocessing, feature engineering,\ + \ and modeling to predict the target. \nReport f1 weighted on the eval data.\ + \ Do not plot or make any visualizations.\n" + gnad10: + dataset: gnad10 + metric: f1 weighted + target_col: label + user_requirement: "This is a gnad10 dataset. Your goal is to predict the target\ + \ column `label`.\nPerform data analysis, data preprocessing, feature engineering,\ + \ and modeling to predict the target. \nReport f1 weighted on the eval data.\ + \ Do not plot or make any visualizations.\n" + sms_spam: + dataset: sms_spam + metric: f1 + target_col: label + user_requirement: "This is a sms_spam dataset. Your goal is to predict the target\ + \ column `label`.\nPerform data analysis, data preprocessing, feature engineering,\ + \ and modeling to predict the target. \nReport f1 on the eval data. Do not plot\ + \ or make any visualizations.\n" From 9f0427838324f90e4bc9ed62c6eba9e6a2ad4465 Mon Sep 17 00:00:00 2001 From: Yizhou Chi Date: Fri, 13 Sep 2024 11:18:15 +0800 Subject: [PATCH 2/5] comment --- expo/data/hf_data.py | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/expo/data/hf_data.py b/expo/data/hf_data.py index 6f1f55f6d..952ab5c73 100644 --- a/expo/data/hf_data.py +++ b/expo/data/hf_data.py @@ -9,22 +9,25 @@ from expo.data.dataset import ExpDataset, process_dataset, save_datasets_dict_to from expo.insights.solution_designer import SolutionDesigner HFDATSETS = [ - # {"name": "sms_spam", "dataset_name": "ucirvine/sms_spam", "target_col": "label", "modality": "text"}, - # {"name": "banking77", "dataset_name": "PolyAI/banking77", "target_col": "label", "modality": "text"}, - # {"name": "gnad10", "dataset_name": "community-datasets/gnad10", "target_col": "label", "modality": "text"}, - { - "name": "oxford-iiit-pet", - "dataset_name": "timm/oxford-iiit-pet", - "target_col": "label_cat_dog", - "modality": "image", - }, - {"name": "stanford_cars", "dataset_name": "tanganke/stanford_cars", "target_col": "label", "modality": "image"}, - { - "name": "fashion_mnist", - "dataset_name": "zalando-datasets/fashion_mnist", - "target_col": "label", - "modality": "image", - }, + {"name": "sms_spam", "dataset_name": "ucirvine/sms_spam", "target_col": "label", "modality": "text"}, + {"name": "banking77", "dataset_name": "PolyAI/banking77", "target_col": "label", "modality": "text"}, + {"name": "gnad10", "dataset_name": "community-datasets/gnad10", "target_col": "label", "modality": "text"}, + # { + # "name": "oxford-iiit-pet", + # "dataset_name": "timm/oxford-iiit-pet", + # "target_col": "label_cat_dog", + # "modality": "image", + # }, + # { "name": "stanford_cars", + # "dataset_name": "tanganke/stanford_cars", + # "target_col": "label", + # "modality": "image"}, + # { + # "name": "fashion_mnist", + # "dataset_name": "zalando-datasets/fashion_mnist", + # "target_col": "label", + # "modality": "image", + # }, ] From a6b066a127f7d2fcfa8e6b04dc4bfaed5b20c50a Mon Sep 17 00:00:00 2001 From: limafang Date: Fri, 13 Sep 2024 17:52:22 +0800 Subject: [PATCH 3/5] support image dataset --- expo/data/hf_data.py | 72 ++++++++++++++++++++++++++++++++------------ 1 file changed, 53 insertions(+), 19 deletions(-) diff --git a/expo/data/hf_data.py b/expo/data/hf_data.py index 952ab5c73..45ff6330b 100644 --- a/expo/data/hf_data.py +++ b/expo/data/hf_data.py @@ -1,7 +1,9 @@ import asyncio import os from pathlib import Path - +import numpy as np +from PIL import Image +import io import pandas as pd from datasets import load_dataset @@ -9,22 +11,25 @@ from expo.data.dataset import ExpDataset, process_dataset, save_datasets_dict_to from expo.insights.solution_designer import SolutionDesigner HFDATSETS = [ - {"name": "sms_spam", "dataset_name": "ucirvine/sms_spam", "target_col": "label", "modality": "text"}, - {"name": "banking77", "dataset_name": "PolyAI/banking77", "target_col": "label", "modality": "text"}, - {"name": "gnad10", "dataset_name": "community-datasets/gnad10", "target_col": "label", "modality": "text"}, - # { - # "name": "oxford-iiit-pet", - # "dataset_name": "timm/oxford-iiit-pet", - # "target_col": "label_cat_dog", - # "modality": "image", - # }, + # {"name": "sms_spam", "dataset_name": "ucirvine/sms_spam", "target_col": "label", "modality": "text"}, + # {"name": "banking77", "dataset_name": "PolyAI/banking77", "target_col": "label", "modality": "text"}, + # {"name": "gnad10", "dataset_name": "community-datasets/gnad10", "target_col": "label", "modality": "text"}, + { + "name": "oxford-iiit-pet", + "dataset_name": "timm/oxford-iiit-pet", + "image_col": "image", + "target_col": "label_cat_dog", + "modality": "image", + }, # { "name": "stanford_cars", # "dataset_name": "tanganke/stanford_cars", + # "image_col": "image", # "target_col": "label", # "modality": "image"}, # { # "name": "fashion_mnist", # "dataset_name": "zalando-datasets/fashion_mnist", + # "image_col": "image", # "target_col": "label", # "modality": "image", # }, @@ -42,16 +47,22 @@ class HFExpDataset(ExpDataset): self.dataset_name = dataset_name self.modality = kwargs.get("modality", "") self.target_col = kwargs.get("target_col", "label") + self.image_col = kwargs.get("image_col", "image") self.dataset = load_dataset(self.dataset_name, trust_remote_code=True) super().__init__(self.name, dataset_dir, **kwargs) def get_raw_dataset(self): raw_dir = Path(self.dataset_dir, self.name, "raw") raw_dir.mkdir(parents=True, exist_ok=True) + if os.path.exists(Path(raw_dir, "train.csv")): df = pd.read_csv(Path(raw_dir, "train.csv"), encoding="utf-8") else: df = self.dataset["train"].to_pandas() + + if self.modality == "image": + df = self.save_images_and_update_df(df, raw_dir, "train") + df.to_csv(Path(raw_dir, "train.csv"), index=False, encoding="utf-8") if os.path.exists(Path(raw_dir, "test.csv")): @@ -59,19 +70,37 @@ class HFExpDataset(ExpDataset): else: if self.dataset and "test" in self.dataset: test_df = self.dataset["test"].to_pandas() + + if self.modality == "image": + test_df = self.save_images_and_update_df(test_df, raw_dir, "test") + test_df.to_csv(Path(raw_dir, "test.csv"), index=False, encoding="utf-8") else: test_df = None + return df, test_df + def save_images_and_update_df(self, df, raw_dir, split): + image_dir = Path(raw_dir, f"{split}_images") + image_dir.mkdir(parents=True, exist_ok=True) + + def process_image(idx, row): + image_bytes = row[self.image_col]["bytes"] + image = Image.open(io.BytesIO(image_bytes)) + if image.mode == "RGBA": + image = image.convert("RGB") + img_path = Path(image_dir, f"{idx}.jpg") + image.save(img_path) + return str(img_path) + + df["image"] = df.apply(lambda row: process_image(row.name, row), axis=1) + return df + def get_df_head(self, raw_df): - if self.modality == "text": - examples = [] - for i in range(5): - examples.append(raw_df.iloc[i].to_dict()) - return examples - elif self.modality == "image": - return "" + examples = [] + for i in range(5): + examples.append(raw_df.iloc[i].to_dict()) + return examples def get_dataset_info(self): dataset_info = super().get_dataset_info() @@ -82,7 +111,7 @@ class HFExpDataset(ExpDataset): if __name__ == "__main__": dataset_dir = "D:/work/automl/datasets" - save_analysis_pool = False + save_analysis_pool = True force_update = False datasets_dict = {"datasets": {}} solution_designer = SolutionDesigner() @@ -92,8 +121,13 @@ if __name__ == "__main__": dataset_dir, dataset_meta["dataset_name"], target_col=dataset_meta["target_col"], + image_col=dataset_meta["image_col"], force_update=force_update, modality=dataset_meta["modality"], ) - asyncio.run(process_dataset(hf_dataset, solution_designer, save_analysis_pool, datasets_dict)) + asyncio.run( + process_dataset( + hf_dataset, solution_designer, save_analysis_pool, datasets_dict + ) + ) save_datasets_dict_to_yaml(datasets_dict, "hf_datasets.yaml") From cfa21ba27e6748229ff61636549c444e25b21904 Mon Sep 17 00:00:00 2001 From: Yizhou Chi Date: Fri, 13 Sep 2024 19:04:48 +0800 Subject: [PATCH 4/5] change img path from abs to rel --- expo/data/hf_data.py | 56 ++++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/expo/data/hf_data.py b/expo/data/hf_data.py index 45ff6330b..6f615c8cb 100644 --- a/expo/data/hf_data.py +++ b/expo/data/hf_data.py @@ -1,19 +1,19 @@ import asyncio +import io import os from pathlib import Path -import numpy as np -from PIL import Image -import io + import pandas as pd from datasets import load_dataset +from PIL import Image from expo.data.dataset import ExpDataset, process_dataset, save_datasets_dict_to_yaml from expo.insights.solution_designer import SolutionDesigner HFDATSETS = [ - # {"name": "sms_spam", "dataset_name": "ucirvine/sms_spam", "target_col": "label", "modality": "text"}, - # {"name": "banking77", "dataset_name": "PolyAI/banking77", "target_col": "label", "modality": "text"}, - # {"name": "gnad10", "dataset_name": "community-datasets/gnad10", "target_col": "label", "modality": "text"}, + {"name": "sms_spam", "dataset_name": "ucirvine/sms_spam", "target_col": "label", "modality": "text"}, + {"name": "banking77", "dataset_name": "PolyAI/banking77", "target_col": "label", "modality": "text"}, + {"name": "gnad10", "dataset_name": "community-datasets/gnad10", "target_col": "label", "modality": "text"}, { "name": "oxford-iiit-pet", "dataset_name": "timm/oxford-iiit-pet", @@ -21,18 +21,20 @@ HFDATSETS = [ "target_col": "label_cat_dog", "modality": "image", }, - # { "name": "stanford_cars", - # "dataset_name": "tanganke/stanford_cars", - # "image_col": "image", - # "target_col": "label", - # "modality": "image"}, - # { - # "name": "fashion_mnist", - # "dataset_name": "zalando-datasets/fashion_mnist", - # "image_col": "image", - # "target_col": "label", - # "modality": "image", - # }, + { + "name": "stanford_cars", + "dataset_name": "tanganke/stanford_cars", + "image_col": "image", + "target_col": "label", + "modality": "image", + }, + { + "name": "fashion_mnist", + "dataset_name": "zalando-datasets/fashion_mnist", + "image_col": "image", + "target_col": "label", + "modality": "image", + }, ] @@ -81,17 +83,19 @@ class HFExpDataset(ExpDataset): return df, test_df def save_images_and_update_df(self, df, raw_dir, split): - image_dir = Path(raw_dir, f"{split}_images") - image_dir.mkdir(parents=True, exist_ok=True) + abs_image_dir = Path(raw_dir, f"{split}_images") + rel_image_dir = f"raw/{split}_images" + abs_image_dir.mkdir(parents=True, exist_ok=True) def process_image(idx, row): image_bytes = row[self.image_col]["bytes"] image = Image.open(io.BytesIO(image_bytes)) if image.mode == "RGBA": image = image.convert("RGB") - img_path = Path(image_dir, f"{idx}.jpg") + img_path = Path(abs_image_dir, f"{idx}.jpg") + rel_img_path = f"{rel_image_dir}/{idx}.jpg" image.save(img_path) - return str(img_path) + return rel_img_path df["image"] = df.apply(lambda row: process_image(row.name, row), axis=1) return df @@ -112,7 +116,7 @@ class HFExpDataset(ExpDataset): if __name__ == "__main__": dataset_dir = "D:/work/automl/datasets" save_analysis_pool = True - force_update = False + force_update = True datasets_dict = {"datasets": {}} solution_designer = SolutionDesigner() for dataset_meta in HFDATSETS: @@ -125,9 +129,5 @@ if __name__ == "__main__": force_update=force_update, modality=dataset_meta["modality"], ) - asyncio.run( - process_dataset( - hf_dataset, solution_designer, save_analysis_pool, datasets_dict - ) - ) + asyncio.run(process_dataset(hf_dataset, solution_designer, save_analysis_pool, datasets_dict)) save_datasets_dict_to_yaml(datasets_dict, "hf_datasets.yaml") From 7d8cb9afec8017fa1006df377028f9648c40d3b5 Mon Sep 17 00:00:00 2001 From: Yizhou Chi Date: Fri, 13 Sep 2024 19:10:45 +0800 Subject: [PATCH 5/5] add image datasets config --- expo/data/hf_data.py | 6 +++--- expo/datasets.yaml | 24 ++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/expo/data/hf_data.py b/expo/data/hf_data.py index 6f615c8cb..df3a6ed20 100644 --- a/expo/data/hf_data.py +++ b/expo/data/hf_data.py @@ -115,8 +115,8 @@ class HFExpDataset(ExpDataset): if __name__ == "__main__": dataset_dir = "D:/work/automl/datasets" - save_analysis_pool = True - force_update = True + save_analysis_pool = False + force_update = False datasets_dict = {"datasets": {}} solution_designer = SolutionDesigner() for dataset_meta in HFDATSETS: @@ -125,7 +125,7 @@ if __name__ == "__main__": dataset_dir, dataset_meta["dataset_name"], target_col=dataset_meta["target_col"], - image_col=dataset_meta["image_col"], + image_col=dataset_meta.get("image_col", ""), force_update=force_update, modality=dataset_meta["modality"], ) diff --git a/expo/datasets.yaml b/expo/datasets.yaml index 051e8232d..92e004c6d 100644 --- a/expo/datasets.yaml +++ b/expo/datasets.yaml @@ -183,6 +183,14 @@ datasets: \ column `label`.\nPerform data analysis, data preprocessing, feature engineering,\ \ and modeling to predict the target. \nReport f1 weighted on the eval data.\ \ Do not plot or make any visualizations.\n" + fashion_mnist: + dataset: fashion_mnist + metric: f1 weighted + target_col: label + user_requirement: "This is a fashion_mnist dataset. Your goal is to predict the\ + \ target column `label`.\nPerform data analysis, data preprocessing, feature\ + \ engineering, and modeling to predict the target. \nReport f1 weighted on the\ + \ eval data. Do not plot or make any visualizations.\n" gnad10: dataset: gnad10 metric: f1 weighted @@ -191,6 +199,14 @@ datasets: \ column `label`.\nPerform data analysis, data preprocessing, feature engineering,\ \ and modeling to predict the target. \nReport f1 weighted on the eval data.\ \ Do not plot or make any visualizations.\n" + oxford-iiit-pet: + dataset: oxford-iiit-pet + metric: f1 + target_col: label_cat_dog + user_requirement: "This is a oxford-iiit-pet dataset. Your goal is to predict\ + \ the target column `label_cat_dog`.\nPerform data analysis, data preprocessing,\ + \ feature engineering, and modeling to predict the target. \nReport f1 on the\ + \ eval data. Do not plot or make any visualizations.\n" sms_spam: dataset: sms_spam metric: f1 @@ -199,3 +215,11 @@ datasets: \ column `label`.\nPerform data analysis, data preprocessing, feature engineering,\ \ and modeling to predict the target. \nReport f1 on the eval data. Do not plot\ \ or make any visualizations.\n" + stanford_cars: + dataset: stanford_cars + metric: f1 weighted + target_col: label + user_requirement: "This is a stanford_cars dataset. Your goal is to predict the\ + \ target column `label`.\nPerform data analysis, data preprocessing, feature\ + \ engineering, and modeling to predict the target. \nReport f1 weighted on the\ + \ eval data. Do not plot or make any visualizations.\n"