# MetaGPT/expo/dataset.py — dataset preparation utilities for the expo benchmark.
import openml
from pathlib import Path
from sklearn.model_selection import train_test_split
import os
import json
import yaml
import pandas as pd
from expo.insights.solution_designer import SolutionDesigner
import asyncio
# Minimal user requirement injected into TASK_PROMPT via {user_requirement}.
BASE_USER_REQUIREMENT = """\
This is a {datasetname} dataset. Your goal is to predict the target column `{target_col}`.
Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target.
Report {metric} on the eval data. Do not plot or make any visualizations.
"""

# Full task prompt handed to the agent: the user requirement plus
# data-handling rules, split-file paths, and the output directory.
TASK_PROMPT = """\
# User requirement
{user_requirement}
**Attention**
1. Please do not leak the target label in any form during training.
2. Dev and Test sets do not have the target column.
3. You should perform transformations on train, dev, and test sets at the same time (it's a good idea to define functions for this and avoid code repetition).
4. If labels are transformed during training, they should be transformed back to the original format before saving the predictions.
## Saving Dev and Test Predictions
1. Save the prediction results of BOTH the dev set and test set in `dev_predictions.csv` and `test_predictions.csv` respectively in the output directory.
- Both files should contain a single column named `target` with the predicted values.
2. Make sure the prediction results are in the same format as the target column in the training set.
- The labels should be transformed back to the original format if any transformation was applied during training.
## Output Training Set Performance
Make sure the performance of the model is printed in python in the last step even if it has been printed in the previous steps. The value should be a float number.
Print the training set performance in the last step. Write in this format:
```python
...
print("Train score:", train_score)
```
# Data dir
training (with labels): {train_path}
dev (without labels): {dev_path}
testing (without labels): {test_path}
# Output dir
{output_dir}
"""

SEED = 100  # random_state for train_test_split -> reproducible splits
TRAIN_TEST_SPLIT = 0.8  # fraction kept as train+dev (rest is test)
TRAIN_DEV_SPLIT = 0.75  # of that fraction, kept as train (rest is dev) -> 60/20/20 overall

# OpenML dataset ids, grouped by task type (regression / binary / multi-class).
OPENML_DATASET_IDS = [
    # reg
    41021,
    42727,
    41980,
    42225,
    531,
    # cls
    41143,
    31,
    42733,
    41162,
    1067,
    # multi cls
    40498,
    40982,
    12,
    40984,
    4538,
]

# (directory name, target column) pairs for local Kaggle-style datasets.
CUSTOM_DATASETS = [
    ("04_titanic", "Survived"),
    ("05_house-prices-advanced-regression-techniques", "SalePrice"),
    ("06_santander-customer-transaction-prediction", "target"),
    ("07_icr-identify-age-related-conditions", "Class")
]
def get_split_dataset_path(dataset_name, config):
    """Return a mapping from split name to the split CSV path for *dataset_name*.

    Raises ValueError when the dataset is not listed in ``config['datasets']``.
    """
    if dataset_name not in config['datasets']:
        raise ValueError(
            f"Dataset {dataset_name} not found in config file. "
            f"Available datasets: {config['datasets'].keys()}"
        )
    entry = config['datasets'][dataset_name]
    data_path = os.path.join(config['datasets_dir'], entry['dataset'])
    # All seven split files follow the split_<name>.csv naming convention.
    split_names = (
        "train",
        "dev", "dev_wo_target", "dev_target",
        "test", "test_wo_target", "test_target",
    )
    return {name: os.path.join(data_path, f"split_{name}.csv") for name in split_names}
def get_user_requirement(task_name, config):
    """Look up *task_name* in the config and return (data_path, user_requirement)."""
    datasets = config['datasets']
    if task_name not in datasets:
        raise ValueError(f"Dataset {task_name} not found in config file. Available datasets: {datasets.keys()}")
    entry = datasets[task_name]
    data_path = os.path.join(config['datasets_dir'], entry['dataset'])
    return data_path, entry['user_requirement']
def save_datasets_dict_to_yaml(datasets_dict, file_path="datasets.yaml"):
    """Serialize *datasets_dict* to a YAML file.

    Generalized: *file_path* lets callers choose the destination; the default
    preserves the original behavior of writing ``datasets.yaml`` in the CWD.
    """
    with open(file_path, "w") as file:
        yaml.dump(datasets_dict, file)
def create_dataset_dict(dataset):
    """Build the YAML-ready config entry for a single dataset object."""
    return {
        "dataset": dataset.name,
        "user_requirement": dataset.create_base_requirement(),
        "metric": dataset.get_metric(),
        "target_col": dataset.target_col,
    }
def generate_task_requirement(task_name, data_config):
    """Build, print, and return the full task prompt for *task_name*.

    Combines the dataset's user requirement with the split-file paths and an
    output directory under ``data_config['work_dir']``.
    """
    # Bug fix: get_user_requirement returns a (data_path, user_requirement)
    # tuple; the original code formatted the whole tuple into the prompt.
    # Only the requirement text belongs in the prompt.
    _, user_requirement = get_user_requirement(task_name, data_config)
    split_dataset_path = get_split_dataset_path(task_name, data_config)
    train_path = split_dataset_path["train"]
    dev_path = split_dataset_path["dev_wo_target"]
    test_path = split_dataset_path["test_wo_target"]
    work_dir = data_config["work_dir"]
    output_dir = f"{work_dir}/{task_name}"
    user_requirement = TASK_PROMPT.format(user_requirement=user_requirement,
                                          train_path=train_path, dev_path=dev_path, test_path=test_path,
                                          output_dir=output_dir)
    print(user_requirement)
    return user_requirement
class ExpDataset:
    """Tabular experiment dataset that is split into train/dev/test CSVs on disk.

    Expects a raw ``train.csv`` under ``<dataset_dir>/<name>/raw/`` and
    materializes seven split files plus a ``dataset_info.json`` summary under
    ``<dataset_dir>/<name>/`` (done eagerly at construction time).
    """

    # Class-level defaults; real values are assigned per instance in __init__.
    description : str = None
    metadata : dict = None
    dataset_dir : str = None
    target_col : str = None
    name : str = None

    def __init__(self, name, dataset_dir, **kwargs):
        """Create the dataset handle and immediately write splits/info to disk.

        Keyword args: ``target_col`` (label column name, required for
        splitting) and ``force_update`` (regenerate files even if present).
        """
        self.name = name
        self.dataset_dir = dataset_dir
        self.target_col = kwargs.get("target_col", None)
        self.force_update = kwargs.get("force_update", False)
        # Side effect at construction time: reads the raw CSV and writes splits.
        self.save_dataset(target_col=self.target_col)

    def check_dataset_exists(self):
        """Return True only if all seven split CSV files are already on disk."""
        fnames = ["split_train.csv", "split_dev.csv", "split_test.csv",
                  "split_dev_wo_target.csv", "split_dev_target.csv",
                  "split_test_wo_target.csv", "split_test_target.csv"]
        for fname in fnames:
            if not os.path.exists(Path(self.dataset_dir, self.name, fname)):
                return False
        return True

    def check_datasetinfo_exists(self):
        """Return True if dataset_info.json already exists for this dataset."""
        return os.path.exists(Path(self.dataset_dir, self.name, "dataset_info.json"))

    def get_raw_dataset(self):
        """Load and return the raw dataframe from ``raw/train.csv``.

        Raises FileNotFoundError when the raw file is missing.
        """
        raw_dir = Path(self.dataset_dir, self.name, "raw")
        if not os.path.exists(Path(raw_dir, "train.csv")):
            raise FileNotFoundError(f"Raw dataset `train.csv` not found in {raw_dir}")
        else:
            df = pd.read_csv(Path(raw_dir, "train.csv"))
        return df

    def get_dataset_info(self):
        """Compute summary statistics and a head preview of the raw dataset.

        Re-reads ``raw/train.csv`` from disk rather than using a cached frame.
        """
        raw_df = pd.read_csv(Path(self.dataset_dir, self.name, "raw", "train.csv"))
        metadata = {
            # nunique of the target: doubles as a class count for
            # classification and a cardinality signal for regression.
            'NumberOfClasses': raw_df[self.target_col].nunique(),
            'NumberOfFeatures': raw_df.shape[1],
            'NumberOfInstances': raw_df.shape[0],
            'NumberOfInstancesWithMissingValues': int(raw_df.isnull().any(axis=1).sum()),
            'NumberOfMissingValues': int(raw_df.isnull().sum().sum()),
            'NumberOfNumericFeatures': raw_df.select_dtypes(include=['number']).shape[1],
            'NumberOfSymbolicFeatures': raw_df.select_dtypes(include=['object']).shape[1],
        }
        df_head_text = raw_df.head().to_string(index=False)
        dataset_info = {
            "name": self.name,
            "description": "",
            "target_col": self.target_col,
            "metadata": metadata,
            "df_head": df_head_text
        }
        return dataset_info

    def get_metric(self):
        """Pick an eval metric from the number of distinct target values.

        Heuristic: exactly 2 -> f1; 3..200 -> weighted f1; 0 or >200 distinct
        values is treated as regression -> rmse. Anything else raises.
        """
        dataset_info = self.get_dataset_info()
        num_classes = dataset_info["metadata"]["NumberOfClasses"]
        if num_classes == 2:
            metric = "f1"
        elif 2 < num_classes <= 200:
            metric = "f1 weighted"
        elif num_classes > 200 or num_classes == 0:
            metric = "rmse"
        else:
            raise ValueError(f"Number of classes {num_classes} not supported")
        return metric

    def create_base_requirement(self):
        """Render the BASE_USER_REQUIREMENT prompt for this dataset."""
        metric = self.get_metric()
        req = BASE_USER_REQUIREMENT.format(datasetname=self.name, target_col=self.target_col, metric=metric)
        return req

    def save_dataset(self, target_col):
        """Create split CSVs and dataset_info.json unless they already exist.

        ``force_update=True`` regenerates both even when present. Note that
        the raw dataset is loaded unconditionally, before the existence check.
        """
        df = self.get_raw_dataset()
        if not self.check_dataset_exists() or self.force_update:
            print(f"Saving Dataset {self.name} in {self.dataset_dir}")
            self.split_and_save(df, target_col)
        else:
            print(f"Dataset {self.name} already exists")
        if not self.check_datasetinfo_exists() or self.force_update:
            print(f"Saving Dataset info for {self.name}")
            dataset_info = self.get_dataset_info()
            self.save_datasetinfo(dataset_info)
        else:
            print(f"Dataset info for {self.name} already exists")

    def save_datasetinfo(self, dataset_info):
        """Write the dataset summary dict to dataset_info.json (pretty-printed)."""
        with open(Path(self.dataset_dir, self.name, "dataset_info.json"), "w") as file:
            json.dump(dataset_info, file, indent=4)

    def save_split_datasets(self, df, split, target_col=None):
        """Save one split as split_<split>.csv; when *target_col* is given,
        also save feature-only (*_wo_target) and label-only (*_target) files.
        """
        path = Path(self.dataset_dir, self.name)
        df.to_csv(Path(path, f"split_{split}.csv"), index=False)
        if target_col:
            df_wo_target = df.drop(columns=[target_col])
            df_wo_target.to_csv(Path(path, f"split_{split}_wo_target.csv"), index=False)
            df_target = df[[target_col]].copy()
            if target_col != "target":
                # Normalize the label column name to `target` in the label file,
                # matching the prediction-file format requested in TASK_PROMPT.
                df_target["target"] = df_target[target_col]
                df_target = df_target.drop(columns=[target_col])
            df_target.to_csv(Path(path, f"split_{split}_target.csv"), index=False)

    def split_and_save(self, df, target_col):
        """Split the raw frame 60/20/20 (train/dev/test) and save all variants.

        The train split keeps its labels; dev and test also get the
        without-target / target-only companion files.
        """
        if not target_col:
            raise ValueError("Target column not provided")
        train, test = train_test_split(df, test_size=1-TRAIN_TEST_SPLIT, random_state=SEED)
        train, dev = train_test_split(train, test_size=1-TRAIN_DEV_SPLIT, random_state=SEED)
        self.save_split_datasets(train, "train")
        self.save_split_datasets(dev, "dev", target_col)
        self.save_split_datasets(test, "test", target_col)
class OpenMLExpDataset(ExpDataset):
    """ExpDataset variant whose raw data comes from OpenML, by dataset id."""

    def __init__(self, name, dataset_dir, dataset_id, **kwargs):
        """Fetch OpenML metadata for *dataset_id* and build the splits.

        The *name* argument is effectively ignored: the OpenML dataset's own
        name and default target attribute are used instead.
        """
        self.dataset_id = dataset_id
        # Metadata-only fetch here; the data itself is downloaded lazily in
        # get_raw_dataset() via dataset.get_data().
        self.dataset = openml.datasets.get_dataset(self.dataset_id,
                                                   download_data=False,
                                                   download_qualities=False,
                                                   download_features_meta_data=True)
        self.name = self.dataset.name
        self.target_col = self.dataset.default_target_attribute
        super().__init__(self.name, dataset_dir, target_col=self.target_col, **kwargs)

    def get_raw_dataset(self):
        """Download the OpenML data, cache it as raw/train.csv, and return it.

        Overrides the base class, which only reads an existing raw file.
        """
        dataset = self.dataset
        dataset_df, *_ = dataset.get_data()
        raw_dir = Path(self.dataset_dir, self.name, "raw")
        os.makedirs(raw_dir, exist_ok=True)
        dataset_df.to_csv(Path(raw_dir, "train.csv"), index=False)
        return dataset_df

    def get_dataset_info(self):
        """Extend the base info with OpenML's name, description, and qualities."""
        dataset_info = super().get_dataset_info()
        dataset = self.dataset
        dataset_info["name"] = dataset.name
        dataset_info["description"] = dataset.description
        # dataset.qualities may trigger a fetch despite download_qualities=False
        # at construction — TODO confirm against the openml client version used.
        dataset_info["metadata"].update(dataset.qualities)
        return dataset_info
# class HFExpDataset(ExpDataset):
# def __init__(self, name, dataset_dir, dataset_name, **kwargs):
# super().__init__(name, dataset_dir, **kwargs)
async def process_dataset(dataset, solution_designer, save_analysis_pool, datasets_dict):
    """Optionally generate the analysis pool for *dataset* and register its
    config entry in ``datasets_dict['datasets']``.

    Bug fix: the original called ``asyncio.run(...)`` inside this coroutine.
    asyncio.run() raises RuntimeError when invoked from a running event loop
    (which is exactly how this coroutine is driven in __main__); the inner
    coroutine must be awaited instead.
    """
    if save_analysis_pool:
        await solution_designer.generate_solutions(dataset.get_dataset_info(), dataset.name)
    dataset_dict = create_dataset_dict(dataset)
    datasets_dict["datasets"][dataset.name] = dataset_dict
if __name__ == "__main__":
    # Entry point: materialize every benchmark dataset (OpenML + local custom
    # ones) and dump their combined config entries to datasets.yaml.
    datasets_dir = "D:/work/automl/datasets"
    force_update = False
    save_analysis_pool = False
    registry = {"datasets": {}}
    designer = SolutionDesigner()
    # OpenML datasets are identified by numeric id; the display name comes
    # from OpenML itself, so an empty name is passed.
    for dataset_id in OPENML_DATASET_IDS:
        openml_ds = OpenMLExpDataset("", datasets_dir, dataset_id, force_update=force_update)
        asyncio.run(process_dataset(openml_ds, designer, save_analysis_pool, registry))
    # Custom datasets already live on disk; only name and target column vary.
    for ds_name, label_col in CUSTOM_DATASETS:
        custom_ds = ExpDataset(ds_name, datasets_dir, target_col=label_col, force_update=force_update)
        asyncio.run(process_dataset(custom_ds, designer, save_analysis_pool, registry))
    save_datasets_dict_to_yaml(registry)