"""Prepare experiment datasets: download/load, split, describe, and register them.

Running this module as a script builds train/dev/test splits and a
``dataset_info.json`` for every OpenML and custom dataset listed below, asks
``SolutionDesigner`` to generate solutions for each, and writes a
``datasets.yaml`` registry to the current working directory.
"""
import asyncio
import json
import os
from pathlib import Path

import openml
import pandas as pd
import yaml
from sklearn.model_selection import train_test_split

from examples.MCTS_test.insights.solution_designer import SolutionDesigner

BASE_USER_REQUIREMENT = """\
This is a {datasetname} dataset. Your goal is to predict the target column `{target_col}`.
Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target.
Report {metric} on the eval data. Do not plot or make any visualizations.
"""

SEED = 100
# Fraction of all rows kept for train+dev; the remainder becomes the test split.
TRAIN_TEST_SPLIT = 0.8
# Fraction of the train+dev rows kept for train; the remainder becomes dev.
TRAIN_DEV_SPLIT = 0.75

OPENML_DATASET_IDS = [
    # regression
    41021,
    42727,
    41980,
    42225,
    531,
    # binary classification
    41143,
    31,
    42733,
    41162,
    1067,
    # multi-class classification
    40498,
    40982,
    12,
    40984,
    4538,
]

# (dataset directory name, target column) pairs for locally stored datasets.
CUSTOM_DATASETS = [
    ("04_titanic", "Survived"),
    ("05_house-prices-advanced-regression-techniques", "SalePrice"),
    ("06_santander-customer-transaction-prediction", "target"),
    ("07_icr-identify-age-related-conditions", "Class"),
]

# Split key -> file name. Shared by `get_split_dataset_path` and
# `ExpDataset.check_dataset_exists` so the two cannot drift apart.
_SPLIT_FILENAMES = {
    "train": "split_train.csv",
    "dev": "split_dev.csv",
    "dev_wo_target": "split_dev_wo_target.csv",
    "dev_target": "split_dev_target.csv",
    "test": "split_test.csv",
    "test_wo_target": "split_test_wo_target.csv",
    "test_target": "split_test_target.csv",
}


def _lookup_dataset(name, config):
    """Return the config entry for `name`, or raise ValueError if it is not registered."""
    datasets = config["datasets"]
    if name not in datasets:
        raise ValueError(
            f"Dataset {name} not found in config file. Available datasets: {datasets.keys()}"
        )
    return datasets[name]


def get_split_dataset_path(dataset_name, config):
    """Return the paths of all split files of a registered dataset.

    Args:
        dataset_name: Key of the dataset under ``config["datasets"]``.
        config: Mapping with a ``"datasets_dir"`` root and a ``"datasets"`` registry.

    Returns:
        Dict mapping split keys (``"train"``, ``"dev"``, ``"dev_wo_target"``,
        ``"dev_target"``, ``"test"``, ``"test_wo_target"``, ``"test_target"``)
        to file paths. The paths are built, not checked for existence.

    Raises:
        ValueError: If `dataset_name` is not present in ``config["datasets"]``.
    """
    datasets_dir = config["datasets_dir"]
    dataset = _lookup_dataset(dataset_name, config)
    data_path = os.path.join(datasets_dir, dataset["dataset"])
    return {
        split: os.path.join(data_path, fname)
        for split, fname in _SPLIT_FILENAMES.items()
    }


def get_user_requirement(task_name, config):
    """Return ``(data_path, user_requirement)`` for a registered dataset.

    Args:
        task_name: Key of the dataset under ``config["datasets"]``.
        config: Mapping with a ``"datasets_dir"`` root and a ``"datasets"`` registry.

    Raises:
        ValueError: If `task_name` is not present in ``config["datasets"]``.
    """
    datasets_dir = config["datasets_dir"]
    dataset = _lookup_dataset(task_name, config)
    data_path = os.path.join(datasets_dir, dataset["dataset"])
    return data_path, dataset["user_requirement"]


def save_datasets_dict_to_yaml(datasets_dict):
    """Write `datasets_dict` to ``datasets.yaml`` in the current working directory."""
    with open("datasets.yaml", "w", encoding="utf-8") as file:
        yaml.dump(datasets_dict, file)


def create_dataset_dict(dataset):
    """Build the registry entry (name, user requirement, metric) for `dataset`."""
    return {
        "dataset": dataset.name,
        "user_requirement": dataset.create_base_requirement(),
        "metric": dataset.get_metric(),
    }


class ExpDataset:
    """A dataset stored under ``<dataset_dir>/<name>`` with a ``raw/train.csv`` file.

    Constructing an instance immediately loads the raw data and writes the
    split files and ``dataset_info.json`` when they are missing or when
    ``force_update`` is set.
    """

    description: str = None
    metadata: dict = None
    dataset_dir: str = None
    target_col: str = None
    name: str = None

    def __init__(self, name, dataset_dir, **kwargs):
        """Store the dataset location and trigger `save_dataset`.

        Args:
            name: Dataset directory name below `dataset_dir`.
            dataset_dir: Root directory holding all datasets.
            **kwargs: ``target_col`` (default ``None``) and ``force_update``
                (default ``False``).
        """
        self.name = name
        self.dataset_dir = dataset_dir
        self.target_col = kwargs.get("target_col", None)
        self.force_update = kwargs.get("force_update", False)
        self.save_dataset(target_col=self.target_col)

    def check_dataset_exists(self):
        """Return True only if every split file is present on disk."""
        base = Path(self.dataset_dir, self.name)
        return all(os.path.exists(base / fname) for fname in _SPLIT_FILENAMES.values())

    def check_datasetinfo_exists(self):
        """Return True if ``dataset_info.json`` is present on disk."""
        return os.path.exists(Path(self.dataset_dir, self.name, "dataset_info.json"))

    def get_raw_dataset(self):
        """Load ``raw/train.csv`` as a DataFrame.

        Raises:
            FileNotFoundError: If ``raw/train.csv`` does not exist.
        """
        raw_dir = Path(self.dataset_dir, self.name, "raw")
        train_path = raw_dir / "train.csv"
        if not os.path.exists(train_path):
            raise FileNotFoundError(f"Raw dataset `train.csv` not found in {raw_dir}")
        return pd.read_csv(train_path)

    def get_dataset_info(self):
        """Describe the raw data: summary counts plus a text preview of the first rows.

        Reads ``raw/train.csv`` from disk on every call.

        Returns:
            Dict with keys ``name``, ``description`` (empty here),
            ``target_col``, ``metadata`` and ``df_head``.
        """
        raw_df = pd.read_csv(Path(self.dataset_dir, self.name, "raw", "train.csv"))
        metadata = {
            "NumberOfClasses": raw_df[self.target_col].nunique(),
            "NumberOfFeatures": raw_df.shape[1],
            "NumberOfInstances": raw_df.shape[0],
            "NumberOfInstancesWithMissingValues": int(raw_df.isnull().any(axis=1).sum()),
            "NumberOfMissingValues": int(raw_df.isnull().sum().sum()),
            "NumberOfNumericFeatures": raw_df.select_dtypes(include=["number"]).shape[1],
            "NumberOfSymbolicFeatures": raw_df.select_dtypes(include=["object"]).shape[1],
        }
        df_head_text = raw_df.head().to_string(index=False)
        return {
            "name": self.name,
            "description": "",
            "target_col": self.target_col,
            "metadata": metadata,
            "df_head": df_head_text,
        }

    def get_metric(self):
        """Choose the evaluation metric from the ``NumberOfClasses`` metadata value.

        Returns:
            ``"f1"`` for 2 classes, ``"f1 weighted"`` for 3 to 200 classes,
            ``"rmse"`` for more than 200 classes or a count of 0.

        Raises:
            ValueError: For any other class count (e.g. 1).
        """
        # Goes through get_dataset_info() on purpose: subclasses may override
        # the metadata it returns.
        num_classes = self.get_dataset_info()["metadata"]["NumberOfClasses"]
        if num_classes == 2:
            return "f1"
        if 2 < num_classes <= 200:
            return "f1 weighted"
        if num_classes > 200 or num_classes == 0:
            return "rmse"
        raise ValueError(f"Number of classes {num_classes} not supported")

    def create_base_requirement(self):
        """Fill `BASE_USER_REQUIREMENT` with this dataset's name, target and metric."""
        metric = self.get_metric()
        return BASE_USER_REQUIREMENT.format(
            datasetname=self.name, target_col=self.target_col, metric=metric
        )

    def save_dataset(self, target_col):
        """Write the split files and dataset info unless they already exist.

        Both artifacts are regenerated when ``self.force_update`` is true.
        The raw data is loaded unconditionally, so a missing raw file is
        reported even when nothing needs to be written.
        """
        df = self.get_raw_dataset()

        if not self.check_dataset_exists() or self.force_update:
            print(f"Saving Dataset {self.name} in {self.dataset_dir}")
            self.split_and_save(df, target_col)
        else:
            print(f"Dataset {self.name} already exists")

        if not self.check_datasetinfo_exists() or self.force_update:
            print(f"Saving Dataset info for {self.name}")
            dataset_info = self.get_dataset_info()
            self.save_datasetinfo(dataset_info)
        else:
            print(f"Dataset info for {self.name} already exists")

    def save_datasetinfo(self, dataset_info):
        """Write `dataset_info` as indented JSON to ``dataset_info.json``."""
        info_path = Path(self.dataset_dir, self.name, "dataset_info.json")
        with open(info_path, "w", encoding="utf-8") as file:
            json.dump(dataset_info, file, indent=4)

    def save_split_datasets(self, df, split, target_col=None):
        """Write one split to CSV, plus target-free and target-only variants.

        Args:
            df: Rows belonging to this split.
            split: Split name used in the file names (``split_<split>*.csv``).
            target_col: When given, also write ``split_<split>_wo_target.csv``
                (all columns except the target) and ``split_<split>_target.csv``
                (the target alone, in a column named ``"target"``).
        """
        path = Path(self.dataset_dir, self.name)
        df.to_csv(Path(path, f"split_{split}.csv"), index=False)
        if target_col:
            df_wo_target = df.drop(columns=[target_col])
            df_wo_target.to_csv(Path(path, f"split_{split}_wo_target.csv"), index=False)
            # The target file always exposes the label under the fixed name "target".
            df_target = df[[target_col]].rename(columns={target_col: "target"})
            df_target.to_csv(Path(path, f"split_{split}_target.csv"), index=False)

    def split_and_save(self, df, target_col):
        """Split `df` into train/dev/test with a fixed seed and save every split.

        With the module constants the result is 60% train, 20% dev and 20% test.
        The train split is saved only as a full file; dev and test also get
        the target-free and target-only files.

        Raises:
            ValueError: If `target_col` is empty or ``None``.
        """
        if not target_col:
            raise ValueError("Target column not provided")
        train, test = train_test_split(df, test_size=1 - TRAIN_TEST_SPLIT, random_state=SEED)
        train, dev = train_test_split(train, test_size=1 - TRAIN_DEV_SPLIT, random_state=SEED)
        self.save_split_datasets(train, "train")
        self.save_split_datasets(dev, "dev", target_col)
        self.save_split_datasets(test, "test", target_col)


class OpenMLExpDataset(ExpDataset):
    """An `ExpDataset` whose raw data and metadata come from OpenML."""

    def __init__(self, name, dataset_dir, dataset_id, **kwargs):
        """Fetch the OpenML dataset description and initialise the base class.

        Args:
            name: Ignored; the name reported by OpenML is used instead.
            dataset_dir: Root directory holding all datasets.
            dataset_id: OpenML dataset id.
            **kwargs: Forwarded to `ExpDataset.__init__` (e.g. ``force_update``).
        """
        self.dataset_id = dataset_id
        self.dataset = openml.datasets.get_dataset(
            self.dataset_id,
            download_data=False,
            download_qualities=False,
            download_features_meta_data=True,
        )
        self.name = self.dataset.name
        self.target_col = self.dataset.default_target_attribute
        super().__init__(self.name, dataset_dir, target_col=self.target_col, **kwargs)

    def get_raw_dataset(self):
        """Fetch the data from OpenML, store it as ``raw/train.csv`` and return it."""
        dataset_df, *_ = self.dataset.get_data()
        raw_dir = Path(self.dataset_dir, self.name, "raw")
        os.makedirs(raw_dir, exist_ok=True)
        dataset_df.to_csv(Path(raw_dir, "train.csv"), index=False)
        return dataset_df

    def get_dataset_info(self):
        """Extend the base info with the OpenML name, description and qualities.

        OpenML qualities overwrite same-named entries computed from the CSV.
        """
        dataset_info = super().get_dataset_info()
        dataset = self.dataset
        dataset_info["name"] = dataset.name
        dataset_info["description"] = dataset.description
        # The dataset is fetched with download_qualities=False, so qualities
        # may be unavailable (None); dict.update(None) would raise TypeError.
        qualities = dataset.qualities
        if qualities:
            dataset_info["metadata"].update(qualities)
        return dataset_info


# class HFExpDataset(ExpDataset):
#     def __init__(self, name, dataset_dir, dataset_name, **kwargs):
#         super().__init__(name, dataset_dir, **kwargs)


def main():
    """Prepare every listed dataset, generate solutions, and write ``datasets.yaml``."""
    datasets_dir = "D:/work/automl/datasets"
    force_update = True
    datasets_dict = {"datasets": {}}
    solution_designer = SolutionDesigner()

    for dataset_id in OPENML_DATASET_IDS:
        openml_dataset = OpenMLExpDataset("", datasets_dir, dataset_id, force_update=force_update)
        asyncio.run(
            solution_designer.generate_solutions(
                openml_dataset.get_dataset_info(), openml_dataset.name
            )
        )
        dataset_dict = create_dataset_dict(openml_dataset)
        datasets_dict["datasets"][openml_dataset.name] = dataset_dict

    for dataset_name, target_col in CUSTOM_DATASETS:
        custom_dataset = ExpDataset(
            dataset_name, datasets_dir, target_col=target_col, force_update=force_update
        )
        asyncio.run(
            solution_designer.generate_solutions(
                custom_dataset.get_dataset_info(), custom_dataset.name
            )
        )
        dataset_dict = create_dataset_dict(custom_dataset)
        datasets_dict["datasets"][custom_dataset.name] = dataset_dict

    save_datasets_dict_to_yaml(datasets_dict)


if __name__ == "__main__":
    main()