# MetaGPT/expo/dataset.py — dataset preparation utilities for the expo benchmark.
import openml
from pathlib import Path
from sklearn.model_selection import train_test_split
import os
import json
import yaml
import pandas as pd
from expo.insights.solution_designer import SolutionDesigner
import asyncio
# Minimal user requirement injected into TASK_PROMPT via {user_requirement}.
BASE_USER_REQUIREMENT = """\
This is a {datasetname} dataset. Your goal is to predict the target column `{target_col}`.
Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target.
Report {metric} on the eval data. Do not plot or make any visualizations.
"""

# Full task prompt handed to the agent: the user requirement plus
# data-handling rules, split-file paths, and the output directory.
TASK_PROMPT = """\
# User requirement
{user_requirement}
**Attention**
1. Please do not leak the target label in any form during training.
2. Dev and Test sets do not have the target column.
3. You should perform transformations on train, dev, and test sets at the same time (it's a good idea to define functions for this and avoid code repetition).
4. If labels are transformed during training, they should be transformed back to the original format before saving the predictions.
## Saving Dev and Test Predictions
1. Save the prediction results of BOTH the dev set and test set in `dev_predictions.csv` and `test_predictions.csv` respectively in the output directory.
- Both files should contain a single column named `target` with the predicted values.
2. Make sure the prediction results are in the same format as the target column in the training set.
- The labels should be transformed back to the original format if any transformation was applied during training.
## Output Training Set Performance
Make sure the performance of the model is printed in python in the last step even if it has been printed in the previous steps. The value should be a float number.
Print the training set performance in the last step. Write in this format:
```python
...
print("Train score:", train_score)
```
# Data dir
training (with labels): {train_path}
dev (without labels): {dev_path}
testing (without labels): {test_path}
# Output dir
{output_dir}
"""

SEED = 100  # random_state for train_test_split -> reproducible splits
TRAIN_TEST_SPLIT = 0.8  # fraction kept as train+dev (rest is test)
TRAIN_DEV_SPLIT = 0.75  # of that fraction, kept as train (rest is dev) -> 60/20/20 overall

# OpenML dataset ids, grouped by task type (regression / binary / multi-class).
OPENML_DATASET_IDS = [
    # reg
    41021,
    42727,
    41980,
    42225,
    531,
    # cls
    41143,
    31,
    42733,
    41162,
    1067,
    # multi cls
    40498,
    40982,
    12,
    40984,
    4538,
]

# (directory name, target column) pairs for local Kaggle-style datasets.
CUSTOM_DATASETS = [
    ("04_titanic", "Survived"),
    ("05_house-prices-advanced-regression-techniques", "SalePrice"),
    ("06_santander-customer-transaction-prediction", "target"),
    ("07_icr-identify-age-related-conditions", "Class")
]
def get_split_dataset_path(dataset_name, config):
    """Return a mapping from split name to the split CSV path for *dataset_name*.

    Raises ValueError when the dataset is not listed in ``config['datasets']``.
    """
    if dataset_name not in config['datasets']:
        raise ValueError(
            f"Dataset {dataset_name} not found in config file. "
            f"Available datasets: {config['datasets'].keys()}"
        )
    entry = config['datasets'][dataset_name]
    data_path = os.path.join(config['datasets_dir'], entry['dataset'])
    # All seven split files follow the split_<name>.csv naming convention.
    split_names = (
        "train",
        "dev", "dev_wo_target", "dev_target",
        "test", "test_wo_target", "test_target",
    )
    return {name: os.path.join(data_path, f"split_{name}.csv") for name in split_names}
def get_user_requirement(task_name, config):
    """Look up *task_name* in the config and return (data_path, user_requirement)."""
    datasets = config['datasets']
    if task_name not in datasets:
        raise ValueError(f"Dataset {task_name} not found in config file. Available datasets: {datasets.keys()}")
    entry = datasets[task_name]
    data_path = os.path.join(config['datasets_dir'], entry['dataset'])
    return data_path, entry['user_requirement']
def save_datasets_dict_to_yaml(datasets_dict, file_path="datasets.yaml"):
    """Serialize *datasets_dict* to a YAML file.

    Generalized: *file_path* lets callers choose the destination; the default
    preserves the original behavior of writing ``datasets.yaml`` in the CWD.
    """
    with open(file_path, "w") as file:
        yaml.dump(datasets_dict, file)
def create_dataset_dict(dataset):
    """Build the YAML-ready config entry for a single dataset object."""
    return {
        "dataset": dataset.name,
        "user_requirement": dataset.create_base_requirement(),
        "metric": dataset.get_metric(),
        "target_col": dataset.target_col,
    }
def generate_task_requirement(task_name, data_config):
    """Build, print, and return the full task prompt for *task_name*.

    Combines the dataset's user requirement with the split-file paths and an
    output directory under ``data_config['work_dir']``.
    """
    # Bug fix: get_user_requirement returns a (data_path, user_requirement)
    # tuple; the original code formatted the whole tuple into the prompt.
    # Only the requirement text belongs in the prompt.
    _, user_requirement = get_user_requirement(task_name, data_config)
    split_dataset_path = get_split_dataset_path(task_name, data_config)
    train_path = split_dataset_path["train"]
    dev_path = split_dataset_path["dev_wo_target"]
    test_path = split_dataset_path["test_wo_target"]
    work_dir = data_config["work_dir"]
    output_dir = f"{work_dir}/{task_name}"
    user_requirement = TASK_PROMPT.format(user_requirement=user_requirement,
                                          train_path=train_path, dev_path=dev_path, test_path=test_path,
                                          output_dir=output_dir)
    print(user_requirement)
    return user_requirement
class ExpDataset:
    """Tabular experiment dataset that is split into train/dev/test CSVs on disk.

    Expects a raw ``train.csv`` under ``<dataset_dir>/<name>/raw/`` and
    materializes seven split files plus a ``dataset_info.json`` summary under
    ``<dataset_dir>/<name>/`` (done eagerly at construction time).
    """

    # Class-level defaults; real values are assigned per instance in __init__.
    description : str = None
    metadata : dict = None
    dataset_dir : str = None
    target_col : str = None
    name : str = None

    def __init__(self, name, dataset_dir, **kwargs):
        """Create the dataset handle and immediately write splits/info to disk.

        Keyword args: ``target_col`` (label column name, required for
        splitting) and ``force_update`` (regenerate files even if present).
        """
        self.name = name
        self.dataset_dir = dataset_dir
        self.target_col = kwargs.get("target_col", None)
        self.force_update = kwargs.get("force_update", False)
        # Side effect at construction time: reads the raw CSV and writes splits.
        self.save_dataset(target_col=self.target_col)

    def check_dataset_exists(self):
        """Return True only if all seven split CSV files are already on disk."""
        fnames = ["split_train.csv", "split_dev.csv", "split_test.csv",
                  "split_dev_wo_target.csv", "split_dev_target.csv",
                  "split_test_wo_target.csv", "split_test_target.csv"]
        for fname in fnames:
            if not os.path.exists(Path(self.dataset_dir, self.name, fname)):
                return False
        return True

    def check_datasetinfo_exists(self):
        """Return True if dataset_info.json already exists for this dataset."""
        return os.path.exists(Path(self.dataset_dir, self.name, "dataset_info.json"))

    def get_raw_dataset(self):
        """Load and return the raw dataframe from ``raw/train.csv``.

        Raises FileNotFoundError when the raw file is missing.
        """
        raw_dir = Path(self.dataset_dir, self.name, "raw")
        if not os.path.exists(Path(raw_dir, "train.csv")):
            raise FileNotFoundError(f"Raw dataset `train.csv` not found in {raw_dir}")
        else:
            df = pd.read_csv(Path(raw_dir, "train.csv"))
        return df

    def get_dataset_info(self):
        """Compute summary statistics and a head preview of the raw dataset.

        Re-reads ``raw/train.csv`` from disk rather than using a cached frame.
        """
        raw_df = pd.read_csv(Path(self.dataset_dir, self.name, "raw", "train.csv"))
        metadata = {
            # nunique of the target: doubles as a class count for
            # classification and a cardinality signal for regression.
            'NumberOfClasses': raw_df[self.target_col].nunique(),
            'NumberOfFeatures': raw_df.shape[1],
            'NumberOfInstances': raw_df.shape[0],
            'NumberOfInstancesWithMissingValues': int(raw_df.isnull().any(axis=1).sum()),
            'NumberOfMissingValues': int(raw_df.isnull().sum().sum()),
            'NumberOfNumericFeatures': raw_df.select_dtypes(include=['number']).shape[1],
            'NumberOfSymbolicFeatures': raw_df.select_dtypes(include=['object']).shape[1],
        }
        df_head_text = raw_df.head().to_string(index=False)
        dataset_info = {
            "name": self.name,
            "description": "",
            "target_col": self.target_col,
            "metadata": metadata,
            "df_head": df_head_text
        }
        return dataset_info

    def get_metric(self):
        """Pick an eval metric from the number of distinct target values.

        Heuristic: exactly 2 -> f1; 3..200 -> weighted f1; 0 or >200 distinct
        values is treated as regression -> rmse. Anything else raises.
        """
        dataset_info = self.get_dataset_info()
        num_classes = dataset_info["metadata"]["NumberOfClasses"]
        if num_classes == 2:
            metric = "f1"
        elif 2 < num_classes <= 200:
            metric = "f1 weighted"
        elif num_classes > 200 or num_classes == 0:
            metric = "rmse"
        else:
            raise ValueError(f"Number of classes {num_classes} not supported")
        return metric

    def create_base_requirement(self):
        """Render the BASE_USER_REQUIREMENT prompt for this dataset."""
        metric = self.get_metric()
        req = BASE_USER_REQUIREMENT.format(datasetname=self.name, target_col=self.target_col, metric=metric)
        return req

    def save_dataset(self, target_col):
        """Create split CSVs and dataset_info.json unless they already exist.

        ``force_update=True`` regenerates both even when present. Note that
        the raw dataset is loaded unconditionally, before the existence check.
        """
        df = self.get_raw_dataset()
        if not self.check_dataset_exists() or self.force_update:
            print(f"Saving Dataset {self.name} in {self.dataset_dir}")
            self.split_and_save(df, target_col)
        else:
            print(f"Dataset {self.name} already exists")
        if not self.check_datasetinfo_exists() or self.force_update:
            print(f"Saving Dataset info for {self.name}")
            dataset_info = self.get_dataset_info()
            self.save_datasetinfo(dataset_info)
        else:
            print(f"Dataset info for {self.name} already exists")

    def save_datasetinfo(self, dataset_info):
        """Write the dataset summary dict to dataset_info.json (pretty-printed)."""
        with open(Path(self.dataset_dir, self.name, "dataset_info.json"), "w") as file:
            json.dump(dataset_info, file, indent=4)

    def save_split_datasets(self, df, split, target_col=None):
        """Save one split as split_<split>.csv; when *target_col* is given,
        also save feature-only (*_wo_target) and label-only (*_target) files.
        """
        path = Path(self.dataset_dir, self.name)
        df.to_csv(Path(path, f"split_{split}.csv"), index=False)
        if target_col:
            df_wo_target = df.drop(columns=[target_col])
            df_wo_target.to_csv(Path(path, f"split_{split}_wo_target.csv"), index=False)
            df_target = df[[target_col]].copy()
            if target_col != "target":
                # Normalize the label column name to `target` in the label file,
                # matching the prediction-file format requested in TASK_PROMPT.
                df_target["target"] = df_target[target_col]
                df_target = df_target.drop(columns=[target_col])
            df_target.to_csv(Path(path, f"split_{split}_target.csv"), index=False)

    def split_and_save(self, df, target_col):
        """Split the raw frame 60/20/20 (train/dev/test) and save all variants.

        The train split keeps its labels; dev and test also get the
        without-target / target-only companion files.
        """
        if not target_col:
            raise ValueError("Target column not provided")
        train, test = train_test_split(df, test_size=1-TRAIN_TEST_SPLIT, random_state=SEED)
        train, dev = train_test_split(train, test_size=1-TRAIN_DEV_SPLIT, random_state=SEED)
        self.save_split_datasets(train, "train")
        self.save_split_datasets(dev, "dev", target_col)
        self.save_split_datasets(test, "test", target_col)
class OpenMLExpDataset(ExpDataset):
    """ExpDataset variant whose raw data comes from OpenML, by dataset id."""

    def __init__(self, name, dataset_dir, dataset_id, **kwargs):
        """Fetch OpenML metadata for *dataset_id* and build the splits.

        The *name* argument is effectively ignored: the OpenML dataset's own
        name and default target attribute are used instead.
        """
        self.dataset_id = dataset_id
        # Metadata-only fetch here; the data itself is downloaded lazily in
        # get_raw_dataset() via dataset.get_data().
        self.dataset = openml.datasets.get_dataset(self.dataset_id,
                                                   download_data=False,
                                                   download_qualities=False,
                                                   download_features_meta_data=True)
        self.name = self.dataset.name
        self.target_col = self.dataset.default_target_attribute
        super().__init__(self.name, dataset_dir, target_col=self.target_col, **kwargs)

    def get_raw_dataset(self):
        """Download the OpenML data, cache it as raw/train.csv, and return it.

        Overrides the base class, which only reads an existing raw file.
        """
        dataset = self.dataset
        dataset_df, *_ = dataset.get_data()
        raw_dir = Path(self.dataset_dir, self.name, "raw")
        os.makedirs(raw_dir, exist_ok=True)
        dataset_df.to_csv(Path(raw_dir, "train.csv"), index=False)
        return dataset_df

    def get_dataset_info(self):
        """Extend the base info with OpenML's name, description, and qualities."""
        dataset_info = super().get_dataset_info()
        dataset = self.dataset
        dataset_info["name"] = dataset.name
        dataset_info["description"] = dataset.description
        # dataset.qualities may trigger a fetch despite download_qualities=False
        # at construction — TODO confirm against the openml client version used.
        dataset_info["metadata"].update(dataset.qualities)
        return dataset_info
# class HFExpDataset(ExpDataset):
# def __init__(self, name, dataset_dir, dataset_name, **kwargs):
# super().__init__(name, dataset_dir, **kwargs)
async def process_dataset(dataset, solution_designer, save_analysis_pool, datasets_dict):
    """Optionally generate the analysis pool for *dataset* and register its
    config entry in ``datasets_dict['datasets']``.

    Bug fix: the original called ``asyncio.run(...)`` inside this coroutine.
    asyncio.run() raises RuntimeError when invoked from a running event loop
    (which is exactly how this coroutine is driven in __main__); the inner
    coroutine must be awaited instead.
    """
    if save_analysis_pool:
        await solution_designer.generate_solutions(dataset.get_dataset_info(), dataset.name)
    dataset_dict = create_dataset_dict(dataset)
    datasets_dict["datasets"][dataset.name] = dataset_dict
if __name__ == "__main__":
    # Entry point: materialize every benchmark dataset (OpenML + local custom
    # ones) and dump their combined config entries to datasets.yaml.
    datasets_dir = "D:/work/automl/datasets"
    force_update = False
    save_analysis_pool = False
    registry = {"datasets": {}}
    designer = SolutionDesigner()
    # OpenML datasets are identified by numeric id; the display name comes
    # from OpenML itself, so an empty name is passed.
    for dataset_id in OPENML_DATASET_IDS:
        openml_ds = OpenMLExpDataset("", datasets_dir, dataset_id, force_update=force_update)
        asyncio.run(process_dataset(openml_ds, designer, save_analysis_pool, registry))
    # Custom datasets already live on disk; only name and target column vary.
    for ds_name, label_col in CUSTOM_DATASETS:
        custom_ds = ExpDataset(ds_name, datasets_dir, target_col=label_col, force_update=force_update)
        asyncio.run(process_dataset(custom_ds, designer, save_analysis_pool, registry))
    save_datasets_dict_to_yaml(registry)