mirror of
https://github.com/FoundationAgents/MetaGPT.git
synced 2026-05-04 21:32:38 +02:00
Merge branch 'text-dataset' into 'expo'
support text dataset See merge request agents/exp_optimizer!11
This commit is contained in:
commit
7458e43edf
3 changed files with 157 additions and 43 deletions
|
|
@ -268,8 +268,9 @@ class ExpDataset:
|
|||
print(f"Dataset info for {self.name} already exists")
|
||||
|
||||
def save_datasetinfo(self, dataset_info):
    """Serialize *dataset_info* to <dataset_dir>/<name>/dataset_info.json.

    Writes UTF-8 with ``ensure_ascii=False`` so non-ASCII dataset
    descriptions (e.g. the German gnad10 corpus) are stored verbatim
    instead of as escaped \\uXXXX sequences.
    """
    with open(Path(self.dataset_dir, self.name, "dataset_info.json"), "w", encoding="utf-8") as file:
        json.dump(dataset_info, file, indent=4, ensure_ascii=False)
|
||||
def save_split_datasets(self, df, split, target_col=None):
|
||||
path = Path(self.dataset_dir, self.name)
|
||||
|
|
|
|||
|
|
@ -1,20 +1,40 @@
|
|||
import asyncio
|
||||
import io
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
from datasets import load_dataset
|
||||
from PIL import Image
|
||||
|
||||
from expo.data.dataset import ExpDataset, process_dataset, save_datasets_dict_to_yaml
|
||||
from expo.insights.solution_designer import SolutionDesigner
|
||||
|
||||
# Hugging Face datasets to mirror locally. Each entry maps a local name to the
# HF hub id, the supervised target column, and the data modality; image entries
# additionally name the column holding the raw image bytes.
HFDATSETS = [
    {"name": "sms_spam", "dataset_name": "ucirvine/sms_spam", "target_col": "label", "modality": "text"},
    {"name": "banking77", "dataset_name": "PolyAI/banking77", "target_col": "label", "modality": "text"},
    {"name": "gnad10", "dataset_name": "community-datasets/gnad10", "target_col": "label", "modality": "text"},
    {
        "name": "oxford-iiit-pet",
        "dataset_name": "timm/oxford-iiit-pet",
        "image_col": "image",
        "target_col": "label_cat_dog",
        "modality": "image",
    },
    {
        "name": "stanford_cars",
        "dataset_name": "tanganke/stanford_cars",
        "image_col": "image",
        "target_col": "label",
        "modality": "image",
    },
    {
        "name": "fashion_mnist",
        "dataset_name": "zalando-datasets/fashion_mnist",
        "image_col": "image",
        "target_col": "label",
        "modality": "image",
    },
]
|
||||
|
||||
|
||||
|
|
@ -27,41 +47,87 @@ class HFExpDataset(ExpDataset):
|
|||
self.name = name
|
||||
self.dataset_dir = dataset_dir
|
||||
self.dataset_name = dataset_name
|
||||
self.modality = kwargs.get("modality", "")
|
||||
self.target_col = kwargs.get("target_col", "label")
|
||||
self.dataset = load_dataset(dataset_name)
|
||||
self.image_col = kwargs.get("image_col", "image")
|
||||
self.dataset = load_dataset(self.dataset_name, trust_remote_code=True)
|
||||
super().__init__(self.name, dataset_dir, **kwargs)
|
||||
|
||||
def get_raw_dataset(self):
    """Return (train_df, test_df), caching both splits as UTF-8 CSVs.

    On first use, each split is pulled from ``self.dataset`` (the loaded HF
    dataset), image columns are materialized to disk for image modality, and
    the frame is cached under <dataset_dir>/<name>/raw/. Subsequent calls read
    the cached CSVs.  ``test_df`` is None when the HF dataset has no test split.
    Note: the old code passed the invalid ``index=False`` kwarg to
    ``pd.read_csv``; reads take ``encoding`` only.
    """
    raw_dir = Path(self.dataset_dir, self.name, "raw")
    raw_dir.mkdir(parents=True, exist_ok=True)

    if os.path.exists(Path(raw_dir, "train.csv")):
        df = pd.read_csv(Path(raw_dir, "train.csv"), encoding="utf-8")
    else:
        df = self.dataset["train"].to_pandas()
        if self.modality == "image":
            # Replace in-memory image bytes with relative file paths.
            df = self.save_images_and_update_df(df, raw_dir, "train")
        df.to_csv(Path(raw_dir, "train.csv"), index=False, encoding="utf-8")

    if os.path.exists(Path(raw_dir, "test.csv")):
        test_df = pd.read_csv(Path(raw_dir, "test.csv"), encoding="utf-8")
    else:
        if self.dataset and "test" in self.dataset:
            test_df = self.dataset["test"].to_pandas()
            if self.modality == "image":
                test_df = self.save_images_and_update_df(test_df, raw_dir, "test")
            test_df.to_csv(Path(raw_dir, "test.csv"), index=False, encoding="utf-8")
        else:
            test_df = None

    return df, test_df
|
||||
|
||||
# def get_df_head(self, raw_df):
|
||||
# return raw_df.head()
|
||||
def save_images_and_update_df(self, df, raw_dir, split):
    """Decode the HF image-bytes column to JPEG files and swap in file paths.

    Args:
        df: frame whose ``self.image_col`` column holds dicts with a "bytes" key
            (the HF on-disk image representation).
        raw_dir: absolute directory of the raw split files.
        split: split name ("train"/"test"); names the image subfolder.

    Returns:
        *df* with an "image" column of paths relative to the dataset root.
    """
    abs_image_dir = Path(raw_dir, f"{split}_images")
    rel_image_dir = f"raw/{split}_images"
    abs_image_dir.mkdir(parents=True, exist_ok=True)

    def process_image(idx, row):
        image = Image.open(io.BytesIO(row[self.image_col]["bytes"]))
        # JPEG cannot store an alpha channel; drop it first.
        if image.mode == "RGBA":
            image = image.convert("RGB")
        rel_img_path = f"{rel_image_dir}/{idx}.jpg"
        image.save(Path(abs_image_dir, f"{idx}.jpg"))
        return rel_img_path

    df["image"] = df.apply(lambda row: process_image(row.name, row), axis=1)
    return df
|
||||
|
||||
def get_df_head(self, raw_df):
    """Return up to the first 5 rows of *raw_df* as a list of column->value dicts.

    Unlike indexing row-by-row with ``iloc`` (which raises IndexError on
    frames with fewer than 5 rows), ``head(5)`` degrades gracefully.
    """
    return raw_df.head(5).to_dict(orient="records")
|
||||
|
||||
def get_dataset_info(self):
    """Return the base dataset info, augmented with the HF description text."""
    info = super().get_dataset_info()
    info["description"] = self.dataset["train"].info.description
    return info
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Local cache root for downloaded datasets — machine-specific; adjust per host.
    dataset_dir = "D:/work/automl/datasets"
    save_analysis_pool = False
    force_update = False
    datasets_dict = {"datasets": {}}
    solution_designer = SolutionDesigner()
    for dataset_meta in HFDATSETS:
        hf_dataset = HFExpDataset(
            dataset_meta["name"],
            dataset_dir,
            dataset_meta["dataset_name"],
            target_col=dataset_meta["target_col"],
            # Text datasets have no image column; pass "" so the default is explicit.
            image_col=dataset_meta.get("image_col", ""),
            force_update=force_update,
            modality=dataset_meta["modality"],
        )
        asyncio.run(process_dataset(hf_dataset, solution_designer, save_analysis_pool, datasets_dict))
    save_datasets_dict_to_yaml(datasets_dict, "hf_datasets.yaml")
|
||||
|
|
|
|||
|
|
@ -79,6 +79,14 @@ datasets:
|
|||
\ column `percent_pell_grant`.\nPerform data analysis, data preprocessing, feature\
|
||||
\ engineering, and modeling to predict the target. \nReport rmse on the eval\
|
||||
\ data. Do not plot or make any visualizations.\n"
|
||||
concrete-strength:
|
||||
dataset: concrete-strength
|
||||
metric: rmse
|
||||
target_col: Strength
|
||||
user_requirement: "This is a concrete-strength dataset. Your goal is to predict\
|
||||
\ the target column `Strength`.\nPerform data analysis, data preprocessing,\
|
||||
\ feature engineering, and modeling to predict the target. \nReport rmse on\
|
||||
\ the eval data. Do not plot or make any visualizations.\n"
|
||||
credit-g:
|
||||
dataset: credit-g
|
||||
metric: f1
|
||||
|
|
@ -135,30 +143,6 @@ datasets:
|
|||
\ column `class`.\nPerform data analysis, data preprocessing, feature engineering,\
|
||||
\ and modeling to predict the target. \nReport f1 weighted on the eval data.\
|
||||
\ Do not plot or make any visualizations.\n"
|
||||
steel-plates-fault:
|
||||
dataset: steel-plates-fault
|
||||
metric: f1 weighted
|
||||
target_col: target
|
||||
user_requirement: "This is a steel-plates-fault dataset. Your goal is to predict\
|
||||
\ the target column `target`.\nPerform data analysis, data preprocessing, feature\
|
||||
\ engineering, and modeling to predict the target. \nReport f1 weighted on the\
|
||||
\ eval data. Do not plot or make any visualizations.\n"
|
||||
wine-quality-white:
|
||||
dataset: wine-quality-white
|
||||
metric: f1 weighted
|
||||
target_col: Class
|
||||
user_requirement: "This is a wine-quality-white dataset. Your goal is to predict\
|
||||
\ the target column `Class`.\nPerform data analysis, data preprocessing, feature\
|
||||
\ engineering, and modeling to predict the target. \nReport f1 weighted on the\
|
||||
\ eval data. Do not plot or make any visualizations.\n"
|
||||
concrete-strength:
|
||||
dataset: concrete-strength
|
||||
metric: rmse
|
||||
target_col: Strength
|
||||
user_requirement: "This is a concrete-strength dataset. Your goal is to predict\
|
||||
\ the target column `Strength`.\nPerform data analysis, data preprocessing,\
|
||||
\ feature engineering, and modeling to predict the target. \nReport rmse on\
|
||||
\ the eval data. Do not plot or make any visualizations.\n"
|
||||
smoker-status:
|
||||
dataset: smoker-status
|
||||
metric: f1
|
||||
|
|
@ -175,4 +159,67 @@ datasets:
|
|||
\ the target column `defects`.\nPerform data analysis, data preprocessing, feature\
|
||||
\ engineering, and modeling to predict the target. \nReport f1 on the eval data.\
|
||||
\ Do not plot or make any visualizations.\n"
|
||||
|
||||
steel-plates-fault:
|
||||
dataset: steel-plates-fault
|
||||
metric: f1 weighted
|
||||
target_col: target
|
||||
user_requirement: "This is a steel-plates-fault dataset. Your goal is to predict\
|
||||
\ the target column `target`.\nPerform data analysis, data preprocessing, feature\
|
||||
\ engineering, and modeling to predict the target. \nReport f1 weighted on the\
|
||||
\ eval data. Do not plot or make any visualizations.\n"
|
||||
wine-quality-white:
|
||||
dataset: wine-quality-white
|
||||
metric: f1 weighted
|
||||
target_col: Class
|
||||
user_requirement: "This is a wine-quality-white dataset. Your goal is to predict\
|
||||
\ the target column `Class`.\nPerform data analysis, data preprocessing, feature\
|
||||
\ engineering, and modeling to predict the target. \nReport f1 weighted on the\
|
||||
\ eval data. Do not plot or make any visualizations.\n"
|
||||
banking77:
|
||||
dataset: banking77
|
||||
metric: f1 weighted
|
||||
target_col: label
|
||||
user_requirement: "This is a banking77 dataset. Your goal is to predict the target\
|
||||
\ column `label`.\nPerform data analysis, data preprocessing, feature engineering,\
|
||||
\ and modeling to predict the target. \nReport f1 weighted on the eval data.\
|
||||
\ Do not plot or make any visualizations.\n"
|
||||
fashion_mnist:
|
||||
dataset: fashion_mnist
|
||||
metric: f1 weighted
|
||||
target_col: label
|
||||
user_requirement: "This is a fashion_mnist dataset. Your goal is to predict the\
|
||||
\ target column `label`.\nPerform data analysis, data preprocessing, feature\
|
||||
\ engineering, and modeling to predict the target. \nReport f1 weighted on the\
|
||||
\ eval data. Do not plot or make any visualizations.\n"
|
||||
gnad10:
|
||||
dataset: gnad10
|
||||
metric: f1 weighted
|
||||
target_col: label
|
||||
user_requirement: "This is a gnad10 dataset. Your goal is to predict the target\
|
||||
\ column `label`.\nPerform data analysis, data preprocessing, feature engineering,\
|
||||
\ and modeling to predict the target. \nReport f1 weighted on the eval data.\
|
||||
\ Do not plot or make any visualizations.\n"
|
||||
oxford-iiit-pet:
|
||||
dataset: oxford-iiit-pet
|
||||
metric: f1
|
||||
target_col: label_cat_dog
|
||||
user_requirement: "This is an oxford-iiit-pet dataset. Your goal is to predict\
|
||||
\ the target column `label_cat_dog`.\nPerform data analysis, data preprocessing,\
|
||||
\ feature engineering, and modeling to predict the target. \nReport f1 on the\
|
||||
\ eval data. Do not plot or make any visualizations.\n"
|
||||
sms_spam:
|
||||
dataset: sms_spam
|
||||
metric: f1
|
||||
target_col: label
|
||||
user_requirement: "This is a sms_spam dataset. Your goal is to predict the target\
|
||||
\ column `label`.\nPerform data analysis, data preprocessing, feature engineering,\
|
||||
\ and modeling to predict the target. \nReport f1 on the eval data. Do not plot\
|
||||
\ or make any visualizations.\n"
|
||||
stanford_cars:
|
||||
dataset: stanford_cars
|
||||
metric: f1 weighted
|
||||
target_col: label
|
||||
user_requirement: "This is a stanford_cars dataset. Your goal is to predict the\
|
||||
\ target column `label`.\nPerform data analysis, data preprocessing, feature\
|
||||
\ engineering, and modeling to predict the target. \nReport f1 weighted on the\
|
||||
\ eval data. Do not plot or make any visualizations.\n"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue